From 3104bed1601ce93df48f26729077f4a3acfe8e20 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:42:15 +0200 Subject: [PATCH 1/8] allow multi reference in hhem metric Signed-off-by: lilacheden --- src/unitxt/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index f86451bf1..4bc15c26d 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4257,7 +4257,7 @@ class FaithfulnessHHEM(BulkInstanceMetric): batch_size: int = 2 model_name: str = "vectara/hallucination_evaluation_model" prediction_type = str - single_reference_per_prediction = True + single_reference_per_prediction = False max_context_words = 4096 reduction_map = {"mean": [main_score]} @@ -4358,7 +4358,7 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = [reference[0] for reference in references] + references = ["\n".join(reference) for reference in references] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred) From 8144ebdd4d4029bacbbb3a53454bc2aad9b83766 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:42:57 +0200 Subject: [PATCH 2/8] add ragbench faithfulness cards Signed-off-by: lilacheden --- prepare/cards/ragbench_faithfulness.py | 41 +++++++++++++++++++ .../faithfulness/ragbench/covidqa.json | 37 +++++++++++++++++ .../rag_eval/faithfulness/ragbench/cuad.json | 37 +++++++++++++++++ .../faithfulness/ragbench/delucionqa.json | 37 +++++++++++++++++ .../faithfulness/ragbench/emanual.json | 37 +++++++++++++++++ .../faithfulness/ragbench/expertqa.json | 37 +++++++++++++++++ .../rag_eval/faithfulness/ragbench/finqa.json | 37 +++++++++++++++++ .../faithfulness/ragbench/hagrid.json | 37 +++++++++++++++++ .../faithfulness/ragbench/hotpotqa.json | 37 +++++++++++++++++ 
.../faithfulness/ragbench/msmarco.json | 37 +++++++++++++++++ .../faithfulness/ragbench/pubmedqa.json | 37 +++++++++++++++++ .../rag_eval/faithfulness/ragbench/tatqa.json | 37 +++++++++++++++++ .../faithfulness/ragbench/techqa.json | 37 +++++++++++++++++ 13 files changed, 485 insertions(+) create mode 100644 prepare/cards/ragbench_faithfulness.py create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json diff --git a/prepare/cards/ragbench_faithfulness.py b/prepare/cards/ragbench_faithfulness.py new file mode 100644 index 000000000..0e718cd2a --- /dev/null +++ b/prepare/cards/ragbench_faithfulness.py @@ -0,0 +1,41 @@ +from unitxt import add_to_catalog +from unitxt.blocks import ( + LoadHF, + TaskCard, +) +from unitxt.operators import Copy, ExecuteExpression +from unitxt.templates import NullTemplate + +for subset in [ + "covidqa", + "cuad", + "delucionqa", + "emanual", + "expertqa", + "finqa", + "hagrid", + "hotpotqa", + "msmarco", + "pubmedqa", + "tatqa", + "techqa", +]: + card = TaskCard( 
+ loader=LoadHF( + path="rungalileo/ragbench", + name=subset, + split="test" + ), + preprocess_steps=[ + Copy(field="response", to_field="answer"), + Copy(field="documents", to_field="contexts"), + ExecuteExpression(expression="int(adherence_score)", to_field="number_val"), + ExecuteExpression(expression="['yes' if adherence_score else 'no']", to_field="is_faithful"), + ], + task="tasks.rag_eval.faithfulness.binary", + templates={"default": NullTemplate()}, + ) + + add_to_catalog( + card, f"cards.rag_eval.faithfulness.ragbench.{subset}", overwrite=True + ) diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json new file mode 100644 index 000000000..ede04fe9b --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "covidqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json new file mode 100644 index 000000000..cf123101a --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "cuad", + "split": "test" 
+ }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json new file mode 100644 index 000000000..707fa49e1 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "delucionqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json new file mode 100644 index 000000000..2cdcf49a4 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "emanual", + 
"split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json new file mode 100644 index 000000000..ceeb85882 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "expertqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json new file mode 100644 index 000000000..d61854f52 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "finqa", + 
"split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json new file mode 100644 index 000000000..5dc19716d --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "hagrid", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json new file mode 100644 index 000000000..a484e6bb4 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "hotpotqa", + 
"split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json new file mode 100644 index 000000000..d962ba072 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "msmarco", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json new file mode 100644 index 000000000..f0012c22f --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": 
"pubmedqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json new file mode 100644 index 000000000..35b362372 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "tatqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json new file mode 100644 index 000000000..769fedaff --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "techqa", 
+ "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} From 7e531effa827b0b909e376700cf5280ed0344424 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:44:25 +0200 Subject: [PATCH 3/8] add "mistral-large-instruct" to provider Signed-off-by: lilacheden --- src/unitxt/inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index c9c404e62..e1f6a28c4 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -2960,6 +2960,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", "llama-3-2-11b-vision-instruct": "watsonx/meta-llama/llama-3-2-11b-vision-instruct", "llama-3-2-90b-vision-instruct": "watsonx/meta-llama/llama-3-2-90b-vision-instruct", + "mistral-large-instruct": "watsonx/mistralai/mistral-large", }, "watsonx-sdk": { "llama-3-2-11b-vision-instruct": "meta-llama/llama-3-2-11b-vision-instruct", From 16ebe425c5bd053baf8a306af59028edaeb83807 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:46:24 +0200 Subject: [PATCH 4/8] add "mistral-large-instruct" classification engines Signed-off-by: lilacheden --- prepare/engines/classification/classification_engines.py | 1 + .../classification/mistral_large_instruct_2407_rits.json | 9 +++++++++ .../engines/classification/mistral_large_watsonx.json | 9 +++++++++ 3 files changed, 19 insertions(+) create mode 100644 
src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json create mode 100644 src/unitxt/catalog/engines/classification/mistral_large_watsonx.json diff --git a/prepare/engines/classification/classification_engines.py b/prepare/engines/classification/classification_engines.py index 635b20331..e996a03b1 100644 --- a/prepare/engines/classification/classification_engines.py +++ b/prepare/engines/classification/classification_engines.py @@ -6,6 +6,7 @@ ) model_names_to_provider = { + "mistral-large-instruct": ["watsonx", "rits"], "llama-3-3-70b-instruct": ["watsonx", "rits"], "llama-3-1-70b-instruct": ["watsonx", "rits"], "gpt-4o": ["open-ai"], diff --git a/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json b/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json new file mode 100644 index 000000000..cc8530861 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "mistral-large-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "rits" +} diff --git a/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json b/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json new file mode 100644 index 000000000..6b1f7f1b0 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "mistral-large-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "watsonx" +} From 56347a8846676357051bd7e1b8e101861eb9cc8b Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:55:41 +0200 Subject: [PATCH 5/8] add rag judges that use mistral-large-instruct Signed-off-by: lilacheden --- prepare/metrics/llm_as_judge/rag_judge.py | 2 ++ 
.../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ 23 files changed, 272 insertions(+) create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 
src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 
src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py index aa418de2d..ded8353f1 100644 --- a/prepare/metrics/llm_as_judge/rag_judge.py +++ b/prepare/metrics/llm_as_judge/rag_judge.py @@ -102,6 +102,8 @@ def get_prediction_field(metric_type): "llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx", "llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits", "gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "mistral_large_instruct_watsonx": "engines.classification.mistral_large_watsonx", + "mistral_large_instruct_rits": "engines.classification.mistral_large_instruct_2407_rits", generic_engine_label: GenericInferenceEngine(), } diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..983044263 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json 
b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..dc161455b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..71e179025 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..4d2bd7ab7 --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..837ae89c9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..501588e54 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": 
"templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..5b3dd429e --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..14ad584b9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": 
"reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..722315d42 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..5e6aec3c8 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..f7d2f00ab --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..d5e34f703 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..d7955a7de --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": 
"tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..8712d4da1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..6e8898e71 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json 
b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..c32a744b0 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..983044263 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..dc161455b --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..71e179025 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..4d2bd7ab7 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": 
"engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..5b3dd429e --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..14ad584b9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + 
"prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} From 0abc51808a761f98f37e49f56a348e46c381e917 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Thu, 13 Feb 2025 13:10:59 +0200 Subject: [PATCH 6/8] fix hhem multi reference Signed-off-by: lilacheden --- src/unitxt/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 8aa10b038..8d108c70c 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4360,7 +4360,8 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = ["\n".join(reference) for reference in references] + references = [str(r) for r in references] + references = ["\n".join(references)] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred) From 1bd0a0a4ec59badfe97756b3f3f5163db6beb880 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Thu, 13 Feb 2025 16:16:18 +0200 Subject: [PATCH 7/8] Revert "fix hhem multi reference" This reverts commit 0abc51808a761f98f37e49f56a348e46c381e917. 
--- src/unitxt/metrics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 8d108c70c..8aa10b038 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4360,8 +4360,7 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = [str(r) for r in references] - references = ["\n".join(references)] + references = ["\n".join(reference) for reference in references] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred) From ff9fa11ff8a93f1c31ca408e59d5fa494bf542ec Mon Sep 17 00:00:00 2001 From: lilacheden Date: Thu, 13 Feb 2025 16:20:10 +0200 Subject: [PATCH 8/8] fix hhem multi reference Signed-off-by: lilacheden --- src/unitxt/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 8aa10b038..7ce8253a5 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4291,7 +4291,8 @@ def compute( # treat the references as the contexts and the predictions as answers # concat references - contexts = ["\n".join(refs) for refs in references] + + contexts = ["\n".join([str(r) for r in refs]) for refs in references] contexts = [" ".join(c.split(" ")[: self.max_context_words]) for c in contexts] answers = predictions