Evaluation Code
Evaluating the Quality of RAG Nodes
Importance of Evaluation
Using LLMs for Evaluation
Example Code: Evaluating Relevance
# dynamiq provides the LLM-based evaluator component and LLM node wrappers;
# python-dotenv pulls API credentials in from a local .env file.
from dynamiq.components.evaluators.llm_evaluator import LLMEvaluator
from dynamiq.nodes.llms import BaseLLM, OpenAI
from dotenv import load_dotenv, find_dotenv

# Load environment variables (e.g. OPENAI_API_KEY) from the nearest .env file.
load_dotenv(find_dotenv())
def run_relevance_to_search_query(llm: BaseLLM):
    """Score how relevant each answer is to its paired search query.

    Builds an ``LLMEvaluator`` configured with two few-shot examples
    (one relevant pair scored 1, one irrelevant pair scored 0) and runs
    it over a small demo batch of query/answer pairs.

    Args:
        llm: Any dynamiq LLM node used to perform the evaluation.

    Returns:
        The evaluator's result payload — one ``relevance_score`` per
        query/answer pair.
    """
    instruction_text = """
Evaluate the relevance of the "Answer" to the "Search Query".
- Score the relevance from 0 to 1.
- Use 1 if the Answer directly addresses the Search Query.
- Use 0 if the Answer is irrelevant to the Search Query.
- Provide a brief justification for the score.
"""

    relevance_evaluator = LLMEvaluator(
        instructions=instruction_text.strip(),
        inputs=[
            ("search_queries", list[str]),
            ("answers", list[str]),
        ],
        outputs=["relevance_score"],
        # Few-shot examples: one clearly relevant pair, one clearly
        # irrelevant pair, to anchor the 0/1 scoring scale.
        examples=[
            {
                "inputs": {
                    "search_queries": "Best Italian restaurants in New York",
                    "answers": "Here are the top-rated Italian restaurants in New York City...",
                },
                "outputs": {"relevance_score": 1},
            },
            {
                "inputs": {
                    "search_queries": "Weather forecast for tomorrow",
                    "answers": "Apple released a new iPhone model today.",
                },
                "outputs": {"relevance_score": 0},
            },
        ],
        llm=llm,
    )

    # Demo batch: three query/answer pairs; the third answer is off-topic.
    demo_queries = [
        "How to bake a chocolate cake?",
        "What is the capital of France?",
        "Latest news on technology.",
    ]
    demo_answers = [
        "To bake a chocolate cake, you need the following ingredients...",
        "The capital of France is Paris.",
        "The weather today is sunny with a chance of rain.",
    ]

    return relevance_evaluator.run(search_queries=demo_queries, answers=demo_answers)
# Example usage with an OpenAI LLM:
if __name__ == "__main__":
    # gpt-4o-mini is a low-cost model suitable for evaluation runs.
    openai_llm = OpenAI(model="gpt-4o-mini")
    results = run_relevance_to_search_query(openai_llm)
    print("Answer Relevance to Search Query Results:")
    print(results)
# Output: Answer Relevance to Search Query Results: {'results': [{'relevance_score': 1}, {'relevance_score': 1}, {'relevance_score': 0}]}

Example Code: Evaluating Correctness
Last updated