IBM · lilacheden · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/prepare/cards/ragbench_faithfulness.py b/prepare/cards/ragbench_faithfulness.py
@@ -0,0 +1,41 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import (
+    LoadHF,
+    TaskCard,
+)
+from unitxt.operators import Copy, ExecuteExpression
+from unitxt.templates import NullTemplate
+
+# Register one faithfulness card per RAGBench subset (test split only).
+for subset in (
+    "covidqa",
+    "cuad",
+    "delucionqa",
+    "emanual",
+    "expertqa",
+    "finqa",
+    "hagrid",
+    "hotpotqa",
+    "msmarco",
+    "pubmedqa",
+    "tatqa",
+    "techqa",
+):
+    loader = LoadHF(path="rungalileo/ragbench", name=subset, split="test")
+    preprocess_steps = [
+        # Mirror `response`/`documents` under the `answer`/`contexts` field names.
+        Copy(field="response", to_field="answer"),
+        Copy(field="documents", to_field="contexts"),
+        # Two encodings of adherence_score: an int, and a one-item 'yes'/'no' list.
+        ExecuteExpression(expression="int(adherence_score)", to_field="number_val"),
+        ExecuteExpression(expression="['yes' if adherence_score else 'no']", to_field="is_faithful"),
+    ]
+    card = TaskCard(
+        loader=loader,
+        preprocess_steps=preprocess_steps,
+        task="tasks.rag_eval.faithfulness.binary",
+        templates={"default": NullTemplate()},
+    )
+
+    catalog_name = f"cards.rag_eval.faithfulness.ragbench.{subset}"
+    add_to_catalog(card, catalog_name, overwrite=True)
diff --git a/prepare/engines/classification/classification_engines.py b/prepare/engines/classification/classification_engines.py
@@ -6,6 +6,7 @@
 )
 
 model_names_to_provider = {
+    "mistral-large-instruct": ["watsonx", "rits"],
     "llama-3-3-70b-instruct": ["watsonx", "rits"],
     "llama-3-1-70b-instruct": ["watsonx", "rits"],
     "gpt-4o": ["open-ai"],

diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py
@@ -102,6 +102,8 @@ def get_prediction_field(metric_type):
     "llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx",
     "llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits",
     "gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai",
+    "mistral_large_instruct_watsonx": "engines.classification.mistral_large_watsonx",
+    "mistral_large_instruct_rits": "engines.classification.mistral_large_instruct_2407_rits",
     generic_engine_label: GenericInferenceEngine(),
 }
 

diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/covidqa.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "covidqa",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/cuad.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "cuad",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/delucionqa.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "delucionqa",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/emanual.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "emanual",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/expertqa.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "expertqa",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/finqa.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "finqa",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hagrid.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "hagrid",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json b/src/unitxt/catalog/cards/rag_eval/faithfulness/ragbench/hotpotqa.json
@@ -0,0 +1,37 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "rungalileo/ragbench",
+        "name": "hotpotqa",
+        "split": "test"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "copy",
+            "field": "response",
+            "to_field": "answer"
+        },
+        {
+            "__type__": "copy",
+            "field": "documents",
+            "to_field": "contexts"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "int(adherence_score)",
+            "to_field": "number_val"
+        },
+        {
+            "__type__": "execute_expression",
+            "expression": "['yes' if adherence_score else 'no']",
+            "to_field": "is_faithful"
+        }
+    ],
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "templates": {
+        "default": {
+            "__type__": "null_template"
+        }
+    }
+}