I am defining a custom evaluator with Ragas, and it keeps raising a "list index out of range" error in the trace. When I test the Ragas evaluate function separately, it works fine, and the data format and run output are correct. Below are the code snippet and a screenshot of the error (I removed some parameter values). Is it because some callback/trace needs to be passed when the run is submitted on the LangSmith platform?
from langchain_core.messages import AIMessage
import numpy as np
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langsmith.schemas import Run, Example
from ragas.metrics import Faithfulness, FactualCorrectness
from ragas import evaluate
from datasets import Dataset
from ragas.run_config import RunConfig
import os
from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from ragas import SingleTurnSample, EvaluationDataset
from ragas.embeddings import LangchainEmbeddingsWrapper
def create_critic_llm():
    """Create a new instance of the critic LLM for each evaluation."""
    return LangchainLLMWrapper(AzureChatOpenAI(
        azure_deployment=,
        api_version=,
        temperature=,
        max_tokens=None,
        timeout=None,
        max_retries=,
    ))

def create_embeddings():
    return LangchainEmbeddingsWrapper(AzureOpenAIEmbeddings(model=))

def result_evaluate(data_sample, metric_name):
    critic_llm = create_critic_llm()
    ada_002_embeddings = create_embeddings()
    dataset = EvaluationDataset(samples=[data_sample])
    if metric_name == 'factual_correctness':
        metrics = [FactualCorrectness(llm=critic_llm)]
    elif metric_name == 'faithfulness':
        metrics = [Faithfulness(llm=critic_llm)]
    score = evaluate(
        embeddings=ada_002_embeddings,
        dataset=dataset,
        metrics=metrics,
        show_progress=False,
        run_config=RunConfig(max_retries=1, log_tenacity=True),
    )
    return np.round(score[metric_name], 4)[0]

def q_correctness(run: Run, example: Example) -> dict:
    """An example evaluator. Larger numbers are better."""
    # The Example object contains the inputs and reference labels from a single row in your dataset (if provided).
    prompt_inputs = example.inputs
    reference_outputs = example.outputs  # aka labels
    # The Run object contains the chain's outputs, often compared to the reference_outputs.
    predicted: AIMessage = run.outputs["output"]
    data_sample = SingleTurnSample(
        user_input=prompt_inputs['question'],
        retrieved_contexts=prompt_inputs['contexts'],
        response=str(predicted.content.strip()),
        reference=reference_outputs['answer'],
    )
    score = result_evaluate(data_sample, 'factual_correctness')
    return {
        # The evaluator key here defines the metric you are measuring.
        # You can provide further descriptions for these in the config.json.
        "key": "q_correctness",
        "score": score,
    }
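For reference, this is roughly how I tested the Ragas path on its own, and it returns a score without errors (the sample contents below are placeholders, not my real data):

# Rough standalone repro of the ragas path; the sample values are placeholders
# standing in for my real question/contexts/answer.
test_sample = SingleTurnSample(
    user_input="What is the capital of France?",
    retrieved_contexts=["Paris is the capital and largest city of France."],
    response="Paris is the capital of France.",
    reference="Paris is the capital of France.",
)
print(result_evaluate(test_sample, "factual_correctness"))  # returns a float, no IndexError

If the fix is that the LangSmith tracing callbacks need to be forwarded into the Ragas call somehow, I am not sure how that is supposed to be wired from inside a custom evaluator.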