
Commit 76f12a6
* fix CI -> GH_PAT and list_contexts issue
* add instructor for enforcing JSON LLM outputs in deepeval
* silence deepeval outputs
* add JSON deepeval-cache parser
* set two deepeval metrics as NotImplemented (protobuf error)

Signed-off-by: Jack Luar <[email protected]>
luarss committed Nov 10, 2024
1 parent 74c8730 commit 76f12a6
Showing 10 changed files with 103 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -40,7 +40,7 @@ jobs:
- name: Create commit comment
uses: peter-evans/commit-comment@v3
with:
token: ${{ secrets.GH_PATH }}
token: ${{ secrets.GH_PAT }}
body-path: evaluation/auto_evaluation/llm_tests_output.txt
- name: Teardown
if: always()
2 changes: 1 addition & 1 deletion backend/Dockerfile
@@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py

EXPOSE 8000

CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
2 changes: 1 addition & 1 deletion backend/src/api/routers/graphs.py
@@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
tool_index = 1
for tool in tools:
urls.extend(list(output[tool_index].values())[0]["urls"])
context.extend(list(set(list(output[tool_index].values())[0]["context"])))
context.append(list(output[tool_index].values())[0]["context"])
tool_index += 1
else:
llm_response = "LLM response extraction failed"
9 changes: 5 additions & 4 deletions backend/src/tools/format_docs.py
@@ -5,7 +5,7 @@

def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = ""
doc_texts = ""
doc_texts = []
doc_urls = []
doc_srcs = []

@@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
else:
doc_text = doc.page_content
doc_texts.append(doc_text)

if "url" in doc.metadata:
doc_urls.append(doc.metadata["url"])

doc_output = "\n\n -------------------------- \n\n".join(doc_texts)

doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"

return doc_texts, doc_srcs, doc_urls
return doc_output, doc_srcs, doc_urls
31 changes: 31 additions & 0 deletions evaluation/auto_evaluation/dataset/preprocess.py
@@ -1,4 +1,5 @@
import csv
import json
from typing import Any


@@ -23,3 +24,33 @@ def write_data(results_list: list[dict[str, Any]], results_path: str):
for result in results_list:
writer.writerow([result[key] for key in keys])
print(f"Results written to {results_path}")


def read_deepeval_cache():
metric_scores = {
"Contextual Precision": [],
"Contextual Recall": [],
"Hallucination": [],
}
metric_passes = {
"Contextual Precision": [],
"Contextual Recall": [],
"Hallucination": [],
}
with open(".deepeval-cache.json") as f:
results = json.load(f)
for _, value in results["test_cases_lookup_map"].items():
for metric in value["cached_metrics_data"]:
metric_scores[metric["metric_data"]["name"]].append(
metric["metric_data"]["score"]
)
metric_passes[metric["metric_data"]["name"]].append(
metric["metric_data"]["success"]
)

print("Metric Scores: ", metric_scores)
print("Metric Passes: ", metric_passes)


if __name__ == "__main__":
read_deepeval_cache()
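
For reference, read_deepeval_cache() only needs a small subset of what deepeval writes. A minimal sketch of the assumed .deepeval-cache.json shape — the key names are exactly the ones the parser indexes, while the test-case key and the metric values are invented for illustration:

sample_cache = {
    "test_cases_lookup_map": {
        "example-test-case-key": {  # hypothetical key; deepeval generates its own
            "cached_metrics_data": [
                {
                    "metric_data": {
                        "name": "Contextual Precision",
                        "score": 0.83,
                        "success": True,
                    }
                }
            ]
        }
    }
}

# Same traversal as read_deepeval_cache(): one score/pass entry per cached metric.
for entry in sample_cache["test_cases_lookup_map"].values():
    for metric in entry["cached_metrics_data"]:
        data = metric["metric_data"]
        print(data["name"], data["score"], data["success"])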
41 changes: 13 additions & 28 deletions evaluation/auto_evaluation/eval_main.py
@@ -4,6 +4,7 @@
"""

import argparse
import time
import requests
import os

@@ -15,8 +16,6 @@
from auto_evaluation.src.metrics.retrieval import (
make_contextual_precision_metric,
make_contextual_recall_metric,
make_contextual_relevancy_metric,
make_faithfulness_metric,
make_hallucination_metric,
)
from auto_evaluation.dataset import hf_pull, preprocess
@@ -65,57 +64,42 @@ def evaluate(self, retriever: str):
response_times = []

# metrics
precision, recall, relevancy, faithfulness, hallucination = (
precision, recall, hallucination = (
make_contextual_precision_metric(self.eval_model),
make_contextual_recall_metric(self.eval_model),
make_contextual_relevancy_metric(self.eval_model),
make_faithfulness_metric(self.eval_model),
make_hallucination_metric(self.eval_model),
)

# retrieval test cases
for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")):
if i < 20:
if i >= 1:
continue
question, ground_truth = qa_pair["question"], qa_pair["ground_truth"]
response, response_time = self.query(retriever, question)
response_text = response["response"]
context = response["context"]
context_list = context[0].split("--------------------------")

# works for: precision, recall, hallucination
retrieval_tc = LLMTestCase(
input=question,
actual_output=response_text,
expected_output=ground_truth,
context=context,
retrieval_context=context,
context=context_list,
retrieval_context=context_list,
)
retrieval_tcs.append(retrieval_tc)
response_times.append(response_time)

# parallel evaluate
evaluate(
retrieval_tcs,
[precision, recall, relevancy, faithfulness, hallucination],
[precision, recall, hallucination],
print_results=False,
)

# result = {
# "question": f"{i + 1}. {question}",
# "ground_truth": ground_truth,
# "retriever_type": retriever,
# "response_time": response_time,
# "response_text": response_text,
# "tool": retriever,
# "contextual_precision": precision.score,
# "contextual_recall": recall.score,
# "contextual_relevancy": relevancy.score,
# "faithfulness": faithfulness.score,
# "hallucination": hallucination.score,
# }
# print(result)
# overall.append(result)

# Write to log file
# preprocess.write_data(overall, log_file)
# parse deepeval results
preprocess.read_deepeval_cache()

def query(self, retriever: str, query: str) -> tuple[dict, float]:
"""
@@ -127,8 +111,9 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]:
if retriever != "agent-retriever-reranker"
else f"{self.reranker_base_url}/{endpoint}"
)
payload = {"query": query, "list_context": True, "list_sources": True}
payload = {"query": query, "list_context": True, "list_sources": False}
try:
time.sleep(5)
response = requests.post(url, json=payload)
return response.json(), response.elapsed.total_seconds() * 1000
except Exception as e:
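
A note on the context handling above: context[0] is the single string the backend builds in format_docs.py, so the split has to use the same dashed separator that format_docs.py joins with. A minimal sketch of that round trip (separator string copied from format_docs.py; the chunk texts are made up, and the strip() is an extra step for this illustration — the code above passes the raw split to the test case):

SEP = "\n\n -------------------------- \n\n"  # joiner used in format_docs.py
chunks = ["chunk about the OpenROAD flow", "chunk about global placement"]

joined = SEP.join(chunks)  # what the API returns as context[0] when list_context is set
recovered = [c.strip() for c in joined.split("--------------------------")]

assert recovered == chunks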
2 changes: 0 additions & 2 deletions evaluation/auto_evaluation/llm_tests.sh
@@ -2,7 +2,6 @@

retrievers=(
"agent-retriever" \
"ensemble" \
)

echo "==================================="
@@ -13,6 +12,5 @@ for retriever in "${retrievers[@]}" ; do
--base_url http://localhost:8000 \
--dataset ./dataset/EDA_Corpus_100_Question.csv \
--retriever $retriever
echo "==> Done"
done
echo "==================================="
12 changes: 4 additions & 8 deletions evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -35,18 +35,14 @@ def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMet
def make_contextual_relevancy_metric(
model: DeepEvalBaseLLM,
) -> ContextualRelevancyMetric:
return ContextualRelevancyMetric(
threshold=RELEVANCY_THRESHOLD,
model=model,
include_reason=True,
raise NotImplementedError(
"ContextualRelevancyMetric is not implemented due to protobuf incompatability"
)


def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
return FaithfulnessMetric(
threshold=FAITHFULNESS_THRESHOLD,
model=model,
include_reason=True,
raise NotImplementedError(
"FaithfulnessMetric is not implemented due to protobuf incompatability"
)


57 changes: 46 additions & 11 deletions evaluation/auto_evaluation/src/models/vertex_ai.py
@@ -3,10 +3,18 @@
Custom DeepEvalLLM wrapper.
"""

import instructor

from typing import Any

from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory # type: ignore
from deepeval.models.base_model import DeepEvalBaseLLM
from pydantic import BaseModel


class Response(BaseModel):
content: str


class GoogleVertexAILangChain(DeepEvalBaseLLM):
@@ -26,17 +34,43 @@ def load_model(self, *args, **kwargs):
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

return ChatVertexAI(
return GenerativeModel(
model_name=self.model_name,
safety_settings=safety_settings,
)

def generate(self, prompt: str) -> Any:
return self.model.invoke(prompt).content
def generate(self, prompt: str, schema: BaseModel) -> Any:
instructor_client = instructor.from_vertexai(
client=self.load_model(),
mode=instructor.Mode.VERTEXAI_TOOLS,
)
resp = instructor_client.messages.create( # type: ignore
messages=[
{
"role": "user",
"content": prompt,
}
],
response_model=schema,
)
return resp

async def a_generate(self, prompt: str) -> Any:
response = await self.model.ainvoke(prompt)
return response.content
async def a_generate(self, prompt: str, schema: BaseModel) -> Any:
instructor_client = instructor.from_vertexai(
client=self.load_model(),
mode=instructor.Mode.VERTEXAI_TOOLS,
_async=True,
)
resp = await instructor_client.messages.create( # type: ignore
messages=[
{
"role": "user",
"content": prompt,
}
],
response_model=schema,
)
return resp

def get_model_name(self):
return self.model_name
Expand All @@ -46,21 +80,22 @@ def main():
model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
prompt = "Write me a joke"
print(f"Prompt: {prompt}")
response = model.generate(prompt)
response = model.generate(prompt, schema=Response)
print(f"Response: {response}")


async def main_async():
model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
prompt = "Write me a joke"
print(f"Prompt: {prompt}")
response = await model.a_generate(prompt)
response = await model.a_generate(prompt, Response)
print(f"Response: {response}")


if __name__ == "__main__":
import asyncio
from dotenv import load_dotenv

load_dotenv()
main()
# asyncio.run(main_async())
# main()
asyncio.run(main_async())
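
For context, the point of the instructor wrapper is that callers now pass a pydantic schema and get a parsed object back instead of free-form text. A hypothetical caller using the wrapper defined in this file (the Verdict model below is invented for this sketch; generate is the method added above, and it presumes valid Vertex AI credentials):

from pydantic import BaseModel

class Verdict(BaseModel):  # hypothetical schema, not part of this commit
    verdict: str
    reason: str

model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
result = model.generate("Is water wet? Give a verdict and a one-line reason.", schema=Verdict)
print(result.verdict)
print(result.reason)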
1 change: 1 addition & 0 deletions evaluation/requirements.txt
@@ -12,3 +12,4 @@ deepeval==1.4.9
langchain-google-vertexai==2.0.6
asyncio==3.4.3
huggingface-hub==0.26.2
instructor[vertexai]==1.5.2

1 comment on commit 76f12a6

@luarss (Collaborator, Author) commented on 76f12a6 Nov 10, 2024


===================================
==> Dataset: EDA Corpus
==> Running tests for agent-retriever
/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command.
warnings.warn(

Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s]
Fetching 2 files: 50%|█████ | 1/2 [00:00<00:00, 6.96it/s]
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 13.88it/s]
Traceback (most recent call last):
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/initializer.py", line 356, in project
self._set_project_as_env_var_or_google_auth_default()
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/initializer.py", line 112, in _set_project_as_env_var_or_google_auth_default
credentials, project = google.auth.default()
^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 657, in default
credentials, project_id = checker()
^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 650, in
lambda: _get_explicit_environ_credentials(quota_project_id=quota_project_id),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 270, in _get_explicit_environ_credentials
credentials, project_id = load_credentials_from_file(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 114, in load_credentials_from_file
raise exceptions.DefaultCredentialsError(
google.auth.exceptions.DefaultCredentialsError: File src/secret.json was not found.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 145, in
harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 45, in init
self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/src/models/vertex_ai.py", line 24, in init
super().init(model_name, *args, **kwargs)
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/models/base_model.py", line 35, in init
self.model = self.load_model(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/src/models/vertex_ai.py", line 37, in load_model
return GenerativeModel(
^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/vertexai/generative_models/_generative_models.py", line 353, in init
project = aiplatform_initializer.global_config.project
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/initializer.py", line 359, in project
raise GoogleAuthError(project_not_found_exception_str) from exc
google.auth.exceptions.GoogleAuthError: Unable to find your project. Please provide a project ID by:
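
The failure above is an environment problem on the runner rather than a bug in the changed code: Vertex AI cannot find application-default credentials because src/secret.json is absent, and therefore cannot resolve a project ID either. A sketch of making that dependency explicit before constructing the model — the environment-variable names here are illustrative, not taken from this repository's CI:

import os
import vertexai

# Assumption: the CI job supplies a service-account key file and a project ID.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "src/secret.json")
vertexai.init(
    project=os.environ["GOOGLE_CLOUD_PROJECT"],  # hypothetical variable name
    location=os.environ.get("GOOGLE_CLOUD_REGION", "us-central1"),
)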
