From 3104bed1601ce93df48f26729077f4a3acfe8e20 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:42:15 +0200 Subject: [PATCH 1/8] allow multi reference in hhem metric Signed-off-by: lilacheden --- src/unitxt/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index f86451bf1..4bc15c26d 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4257,7 +4257,7 @@ class FaithfulnessHHEM(BulkInstanceMetric): batch_size: int = 2 model_name: str = "vectara/hallucination_evaluation_model" prediction_type = str - single_reference_per_prediction = True + single_reference_per_prediction = False max_context_words = 4096 reduction_map = {"mean": [main_score]} @@ -4358,7 +4358,7 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = [reference[0] for reference in references] + references = ["\n".join(reference) for reference in references] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred) From 8144ebdd4d4029bacbbb3a53454bc2aad9b83766 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:42:57 +0200 Subject: [PATCH 2/8] add ragbench faithfulness cards Signed-off-by: lilacheden --- prepare/cards/ragbench_faithfulness.py | 41 +++++++++++++++++++ .../faithfulness/ragbench/covidqa.json | 37 +++++++++++++++++ .../rag_eval/faithfulness/ragbench/cuad.json | 37 +++++++++++++++++ .../faithfulness/ragbench/delucionqa.json | 37 +++++++++++++++++ .../faithfulness/ragbench/emanual.json | 37 +++++++++++++++++ .../faithfulness/ragbench/expertqa.json | 37 +++++++++++++++++ .../rag_eval/faithfulness/ragbench/finqa.json | 37 +++++++++++++++++ .../faithfulness/ragbench/hagrid.json | 37 +++++++++++++++++ .../faithfulness/ragbench/hotpotqa.json | 37 +++++++++++++++++ 
.../faithfulness/ragbench/msmarco.json | 37 +++++++++++++++++ .../faithfulness/ragbench/pubmedqa.json | 37 +++++++++++++++++ .../rag_eval/faithfulness/ragbench/tatqa.json | 37 +++++++++++++++++ .../faithfulness/ragbench/techqa.json | 37 +++++++++++++++++ 13 files changed, 485 insertions(+) create mode 100644 prepare/cards/ragbench_faithfulness.py create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json create mode 100644 src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json diff --git a/prepare/cards/ragbench_faithfulness.py b/prepare/cards/ragbench_faithfulness.py new file mode 100644 index 000000000..0e718cd2a --- /dev/null +++ b/prepare/cards/ragbench_faithfulness.py @@ -0,0 +1,41 @@ +from unitxt import add_to_catalog +from unitxt.blocks import ( + LoadHF, + TaskCard, +) +from unitxt.operators import Copy, ExecuteExpression +from unitxt.templates import NullTemplate + +for subset in [ + "covidqa", + "cuad", + "delucionqa", + "emanual", + "expertqa", + "finqa", + "hagrid", + "hotpotqa", + "msmarco", + "pubmedqa", + "tatqa", + "techqa", +]: + card = TaskCard( 
+ loader=LoadHF( + path="rungalileo/ragbench", + name=subset, + split="test" + ), + preprocess_steps=[ + Copy(field="response", to_field="answer"), + Copy(field="documents", to_field="contexts"), + ExecuteExpression(expression="int(adherence_score)", to_field="number_val"), + ExecuteExpression(expression="['yes' if adherence_score else 'no']", to_field="is_faithful"), + ], + task="tasks.rag_eval.faithfulness.binary", + templates={"default": NullTemplate()}, + ) + + add_to_catalog( + card, f"cards.rag_eval.faithfulness.ragbench.{subset}", overwrite=True + ) diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json new file mode 100644 index 000000000..ede04fe9b --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "covidqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json new file mode 100644 index 000000000..cf123101a --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "cuad", + "split": "test" 
+ }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json new file mode 100644 index 000000000..707fa49e1 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "delucionqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json new file mode 100644 index 000000000..2cdcf49a4 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "emanual", + 
"split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json new file mode 100644 index 000000000..ceeb85882 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "expertqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json new file mode 100644 index 000000000..d61854f52 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "finqa", + 
"split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json new file mode 100644 index 000000000..5dc19716d --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "hagrid", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json new file mode 100644 index 000000000..a484e6bb4 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "hotpotqa", + 
"split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json new file mode 100644 index 000000000..d962ba072 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/msmarco.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "msmarco", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json new file mode 100644 index 000000000..f0012c22f --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/pubmedqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": 
"pubmedqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json new file mode 100644 index 000000000..35b362372 --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/tatqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "tatqa", + "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json new file mode 100644 index 000000000..769fedaff --- /dev/null +++ b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/techqa.json @@ -0,0 +1,37 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "rungalileo/ragbench", + "name": "techqa", 
+ "split": "test" + }, + "preprocess_steps": [ + { + "__type__": "copy", + "field": "response", + "to_field": "answer" + }, + { + "__type__": "copy", + "field": "documents", + "to_field": "contexts" + }, + { + "__type__": "execute_expression", + "expression": "int(adherence_score)", + "to_field": "number_val" + }, + { + "__type__": "execute_expression", + "expression": "['yes' if adherence_score else 'no']", + "to_field": "is_faithful" + } + ], + "task": "tasks.rag_eval.faithfulness.binary", + "templates": { + "default": { + "__type__": "null_template" + } + } +} From 7e531effa827b0b909e376700cf5280ed0344424 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:44:25 +0200 Subject: [PATCH 3/8] add "mistral-large-instruct" to provider Signed-off-by: lilacheden --- src/unitxt/inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index c9c404e62..e1f6a28c4 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -2960,6 +2960,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", "llama-3-2-11b-vision-instruct": "watsonx/meta-llama/llama-3-2-11b-vision-instruct", "llama-3-2-90b-vision-instruct": "watsonx/meta-llama/llama-3-2-90b-vision-instruct", + "mistral-large-instruct": "watsonx/mistralai/mistral-large", }, "watsonx-sdk": { "llama-3-2-11b-vision-instruct": "meta-llama/llama-3-2-11b-vision-instruct", From 16ebe425c5bd053baf8a306af59028edaeb83807 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:46:24 +0200 Subject: [PATCH 4/8] add "mistral-large-instruct" classification engines Signed-off-by: lilacheden --- prepare/engines/classification/classification_engines.py | 1 + .../classification/mistral_large_instruct_2407_rits.json | 9 +++++++++ .../engines/classification/mistral_large_watsonx.json | 9 +++++++++ 3 files changed, 19 insertions(+) create mode 100644 
src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json create mode 100644 src/unitxt/catalog/engines/classification/mistral_large_watsonx.json diff --git a/prepare/engines/classification/classification_engines.py b/prepare/engines/classification/classification_engines.py index 635b20331..e996a03b1 100644 --- a/prepare/engines/classification/classification_engines.py +++ b/prepare/engines/classification/classification_engines.py @@ -6,6 +6,7 @@ ) model_names_to_provider = { + "mistral-large-instruct": ["watsonx", "rits"], "llama-3-3-70b-instruct": ["watsonx", "rits"], "llama-3-1-70b-instruct": ["watsonx", "rits"], "gpt-4o": ["open-ai"], diff --git a/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json b/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json new file mode 100644 index 000000000..cc8530861 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/mistral_large_instruct_2407_rits.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "mistral-large-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "rits" +} diff --git a/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json b/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json new file mode 100644 index 000000000..6b1f7f1b0 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/mistral_large_watsonx.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "mistral-large-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "watsonx" +} From 56347a8846676357051bd7e1b8e101861eb9cc8b Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 11 Feb 2025 19:55:41 +0200 Subject: [PATCH 5/8] add rag judges that use mistral-large-instruct Signed-off-by: lilacheden --- prepare/metrics/llm_as_judge/rag_judge.py | 2 ++ 
.../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 11 +++++++++++ .../mistral_large_instruct_watsonx_judge.json | 11 +++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ .../mistral_large_instruct_rits_judge.json | 13 +++++++++++++ .../mistral_large_instruct_watsonx_judge.json | 13 +++++++++++++ 23 files changed, 272 insertions(+) create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 
src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json create mode 100644 
src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py index aa418de2d..ded8353f1 100644 --- a/prepare/metrics/llm_as_judge/rag_judge.py +++ b/prepare/metrics/llm_as_judge/rag_judge.py @@ -102,6 +102,8 @@ def get_prediction_field(metric_type): "llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx", "llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits", "gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "mistral_large_instruct_watsonx": "engines.classification.mistral_large_watsonx", + "mistral_large_instruct_rits": "engines.classification.mistral_large_instruct_2407_rits", generic_engine_label: GenericInferenceEngine(), } diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..983044263 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json 
b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..dc161455b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..71e179025 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..4d2bd7ab7 --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..837ae89c9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..501588e54 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": 
"templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..5b3dd429e --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..14ad584b9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": 
"reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..722315d42 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..5e6aec3c8 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..f7d2f00ab --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..d5e34f703 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..d7955a7de --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": 
"tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..8712d4da1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..6e8898e71 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json 
b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..c32a744b0 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..983044263 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..dc161455b --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..71e179025 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..4d2bd7ab7 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": 
"engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json new file mode 100644 index 000000000..5b3dd429e --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_instruct_2407_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json new file mode 100644 index 000000000..14ad584b9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/mistral_large_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.mistral_large_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + 
"prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} From 0abc51808a761f98f37e49f56a348e46c381e917 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Thu, 13 Feb 2025 13:10:59 +0200 Subject: [PATCH 6/8] fix hhem multi reference Signed-off-by: lilacheden --- src/unitxt/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 8aa10b038..8d108c70c 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4360,7 +4360,8 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = ["\n".join(reference) for reference in references] + references = [str(r) for r in references] + references = ["\n".join(references)] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred) From 1bd0a0a4ec59badfe97756b3f3f5163db6beb880 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Thu, 13 Feb 2025 16:16:18 +0200 Subject: [PATCH 7/8] Revert "fix hhem multi reference" This reverts commit 0abc51808a761f98f37e49f56a348e46c381e917. 
--- src/unitxt/metrics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 8d108c70c..8aa10b038 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4360,8 +4360,7 @@ def compute( from collections import defaultdict query_to_predictions_and_references = defaultdict(lambda: [[], []]) - references = [str(r) for r in references] - references = ["\n".join(references)] + references = ["\n".join(reference) for reference in references] for reference, pred, inputs_dict in zip(references, predictions, task_data): query = inputs_dict.get("query") query_to_predictions_and_references[query][0].append(pred) From ff9fa11ff8a93f1c31ca408e59d5fa494bf542ec Mon Sep 17 00:00:00 2001 From: lilacheden Date: Thu, 13 Feb 2025 16:20:10 +0200 Subject: [PATCH 8/8] fix hhem multi reference Signed-off-by: lilacheden --- src/unitxt/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 8aa10b038..7ce8253a5 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4291,7 +4291,8 @@ def compute( # treat the references as the contexts and the predictions as answers # concat references - contexts = ["\n".join(refs) for refs in references] + + contexts = ["\n".join([str(r) for r in refs]) for refs in references] contexts = [" ".join(c.split(" ")[: self.max_context_words]) for c in contexts] answers = predictions