diff --git a/prepare/cards/ragbench_faithfulness.py b/prepare/cards/ragbench_faithfulness.py new file mode 100644 index 0000000000..0e718cd2a3 --- /dev/null +++ b/prepare/cards/ragbench_faithfulness.py @@ -0,0 +1,41 @@ +from unitxt import add_to_catalog +from unitxt.blocks import ( + LoadHF, + TaskCard, +) +from unitxt.operators import Copy, ExecuteExpression +from unitxt.templates import NullTemplate + +for subset in [ + "covidqa", + "cuad", + "delucionqa", + "emanual", + "expertqa", + "finqa", + "hagrid", + "hotpotqa", + "msmarco", + "pubmedqa", + "tatqa", + "techqa", +]: + card = TaskCard( + loader=LoadHF( + path="rungalileo/ragbench", + name=subset, + split="test" + ), + preprocess_steps=[ + Copy(field="response", to_field="answer"), + Copy(field="documents", to_field="contexts"), + ExecuteExpression(expression="int(adherence_score)", to_field="number_val"), + ExecuteExpression(expression="['yes' if adherence_score else 'no']", to_field="is_faithful"), + ], + task="tasks.rag_eval.faithfulness.binary", + templates={"default": NullTemplate()}, + ) + + add_to_catalog( + card, f"cards.rag_eval.faithfulness.ragbench.{subset}", overwrite=True + ) diff --git a/prepare/engines/classification/classification_engines.py b/prepare/engines/classification/classification_engines.py index 635b203319..e996a03b1d 100644 --- a/prepare/engines/classification/classification_engines.py +++ b/prepare/engines/classification/classification_engines.py @@ -6,6 +6,7 @@ ) model_names_to_provider = { + "mistral-large-instruct": ["watsonx", "rits"], "llama-3-3-70b-instruct": ["watsonx", "rits"], "llama-3-1-70b-instruct": ["watsonx", "rits"], "gpt-4o": ["open-ai"], diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py index aa418de2d1..ded8353f12 100644 --- a/prepare/metrics/llm_as_judge/rag_judge.py +++ b/prepare/metrics/llm_as_judge/rag_judge.py @@ -102,6 +102,8 @@ def get_prediction_field(metric_type): "llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx", "llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits", "gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "mistral_large_instruct_watsonx": "engines.classification.mistral_large_watsonx", + "mistral_large_instruct_rits": "engines.classification.mistral_large_instruct_2407_rits", generic_engine_label: GenericInferenceEngine(), } diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json new file mode 100644 index 0000000000..ede04fe9b7 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "covidqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json new file mode 100644 index 0000000000..cf123101a1 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "cuad", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json new file mode 100644 index 0000000000..707fa49e11 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "delucionqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json new file mode 100644 index 0000000000..2cdcf49a41 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "emanual", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json new file mode 100644 index 0000000000..ceeb85882f --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "expertqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json new file mode 100644 index 0000000000..d61854f528 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "finqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json new file mode 100644 index 0000000000..5dc19716da --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "hagrid", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json new file mode 100644 index 0000000000..a484e6bb45 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "hotpotqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json new file mode 100644 index 0000000000..d962ba0724 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "msmarco", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json new file mode 100644 index 0000000000..f0012c22f7 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "pubmedqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json new file mode 100644 index 0000000000..35b3623725 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "tatqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json new file mode 100644 index 0000000000..769fedaff2 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "techqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json b/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json new file mode 100644 index 0000000000..cc8530861d --- /dev/null +++ b/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "mistral-large-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "rits" +} diff --git a/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json b/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json new file mode 100644 index 0000000000..6b1f7f1b0a --- /dev/null +++ b/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "mistral-large-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "watsonx" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..9830442635 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..dc161455b2 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..71e179025a --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..4d2bd7ab72 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..837ae89c94 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..501588e549 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..5b3dd429ef --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..14ad584b97 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..722315d42b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..5e6aec3c8a --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..f7d2f00aba --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..d5e34f703f --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..d7955a7def --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..8712d4da17 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..6e8898e710 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..c32a744b0b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..9830442635 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..dc161455b2 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..71e179025a --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..4d2bd7ab72 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 0000000000..5b3dd429ef --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 0000000000..14ad584b97 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index fd9871bb05..1205b6f731 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -2972,6 +2972,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", "llama-3-2-11b-vision-instruct": "watsonx/meta-llama/llama-3-2-11b-vision-instruct", "llama-3-2-90b-vision-instruct": "watsonx/meta-llama/llama-3-2-90b-vision-instruct", + "mistral-large-instruct": "watsonx/mistralai/mistral-large", }, "watsonx-sdk": { "llama-3-2-11b-vision-instruct": "meta-llama/llama-3-2-11b-vision-instruct", diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index dd1f98b554..7ce8253a50 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4259,7 +4259,7 @@ class FaithfulnessHHEM(BulkInstanceMetric): batch_size: int = 2 model_name: str = "vectara/hallucination_evaluation_model" prediction_type = str - single_reference_per_prediction = True + # single_reference_per_prediction = True max_context_words = 4096 reduction_map = {"mean": [main_score]} @@ -4291,7 +4291,8 @@ def compute( # treat the references as the contexts and the predictions as answers # concat references - contexts = ["\n".join(refs) for refs in references] + + contexts = ["\n".join([str(r) for r in refs]) for refs in references] contexts = [" ".join(c.split(" ")[: self.max_context_words]) for c in contexts] answers = predictions @@ -4360,7 +4361,7 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = [reference[0] for reference in references] + references = ["\n".join(reference) for reference in references] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred)