
Commit 76f12a6
* fix CI -> GH_PAT and list_contexts issue
* add instructor for enforcing JSON LLM outputs in deepeval
* silence deepeval outputs
* add JSON deepeval-cache parser
* set two deepeval metrics as NotImplemented (protobuf error)

Signed-off-by: Jack Luar <[email protected]>
luarss committed Nov 10, 2024
1 parent 74c8730 commit 76f12a6
Showing 10 changed files with 103 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -40,7 +40,7 @@ jobs:
- name: Create commit comment
uses: peter-evans/commit-comment@v3
with:
token: ${{ secrets.GH_PATH }}
token: ${{ secrets.GH_PAT }}
body-path: evaluation/auto_evaluation/llm_tests_output.txt
- name: Teardown
if: always()
2 changes: 1 addition & 1 deletion backend/Dockerfile
@@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py

EXPOSE 8000

CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
2 changes: 1 addition & 1 deletion backend/src/api/routers/graphs.py
@@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
tool_index = 1
for tool in tools:
urls.extend(list(output[tool_index].values())[0]["urls"])
context.extend(list(set(list(output[tool_index].values())[0]["context"])))
context.append(list(output[tool_index].values())[0]["context"])
tool_index += 1
else:
llm_response = "LLM response extraction failed"
9 changes: 5 additions & 4 deletions backend/src/tools/format_docs.py
@@ -5,7 +5,7 @@

def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = ""
doc_texts = ""
doc_texts = []
doc_urls = []
doc_srcs = []

@@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
else:
doc_text = doc.page_content
doc_texts.append(doc_text)

if "url" in doc.metadata:
doc_urls.append(doc.metadata["url"])

doc_output = "\n\n -------------------------- \n\n".join(doc_texts)

doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"

return doc_texts, doc_srcs, doc_urls
return doc_output, doc_srcs, doc_urls
31 changes: 31 additions & 0 deletions evaluation/auto_evaluation/dataset/preprocess.py
@@ -1,4 +1,5 @@
import csv
import json
from typing import Any


@@ -23,3 +24,33 @@ def write_data(results_list: list[dict[str, Any]], results_path: str):
for result in results_list:
writer.writerow([result[key] for key in keys])
print(f"Results written to {results_path}")


def read_deepeval_cache():
metric_scores = {
"Contextual Precision": [],
"Contextual Recall": [],
"Hallucination": [],
}
metric_passes = {
"Contextual Precision": [],
"Contextual Recall": [],
"Hallucination": [],
}
with open(".deepeval-cache.json") as f:
results = json.load(f)
for _, value in results["test_cases_lookup_map"].items():
for metric in value["cached_metrics_data"]:
metric_scores[metric["metric_data"]["name"]].append(
metric["metric_data"]["score"]
)
metric_passes[metric["metric_data"]["name"]].append(
metric["metric_data"]["success"]
)

print("Metric Scores: ", metric_scores)
print("Metric Passes: ", metric_passes)


if __name__ == "__main__":
read_deepeval_cache()
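
For reference, read_deepeval_cache() only needs a small subset of what deepeval writes. A minimal sketch of the assumed .deepeval-cache.json shape — the key names are exactly the ones the parser indexes, while the test-case key and the metric values are invented for illustration:

sample_cache = {
    "test_cases_lookup_map": {
        "example-test-case-key": {  # hypothetical key; deepeval generates its own
            "cached_metrics_data": [
                {
                    "metric_data": {
                        "name": "Contextual Precision",
                        "score": 0.83,
                        "success": True,
                    }
                }
            ]
        }
    }
}

# Same traversal as read_deepeval_cache(): one score/pass entry per cached metric.
for entry in sample_cache["test_cases_lookup_map"].values():
    for metric in entry["cached_metrics_data"]:
        data = metric["metric_data"]
        print(data["name"], data["score"], data["success"])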
41 changes: 13 additions & 28 deletions evaluation/auto_evaluation/eval_main.py
@@ -4,6 +4,7 @@
"""

import argparse
import time
import requests
import os

@@ -15,8 +16,6 @@
from auto_evaluation.src.metrics.retrieval import (
make_contextual_precision_metric,
make_contextual_recall_metric,
make_contextual_relevancy_metric,
make_faithfulness_metric,
make_hallucination_metric,
)
from auto_evaluation.dataset import hf_pull, preprocess
@@ -65,57 +64,42 @@ def evaluate(self, retriever: str):
response_times = []

# metrics
precision, recall, relevancy, faithfulness, hallucination = (
precision, recall, hallucination = (
make_contextual_precision_metric(self.eval_model),
make_contextual_recall_metric(self.eval_model),
make_contextual_relevancy_metric(self.eval_model),
make_faithfulness_metric(self.eval_model),
make_hallucination_metric(self.eval_model),
)

# retrieval test cases
for i, qa_pair in enumerate(tqdm(self.qns, desc="Evaluating")):
if i < 20:
if i >= 1:
continue
question, ground_truth = qa_pair["question"], qa_pair["ground_truth"]
response, response_time = self.query(retriever, question)
response_text = response["response"]
context = response["context"]
context_list = context[0].split("--------------------------")

# works for: precision, recall, hallucination
retrieval_tc = LLMTestCase(
input=question,
actual_output=response_text,
expected_output=ground_truth,
context=context,
retrieval_context=context,
context=context_list,
retrieval_context=context_list,
)
retrieval_tcs.append(retrieval_tc)
response_times.append(response_time)

# parallel evaluate
evaluate(
retrieval_tcs,
[precision, recall, relevancy, faithfulness, hallucination],
[precision, recall, hallucination],
print_results=False,
)

# result = {
# "question": f"{i + 1}. {question}",
# "ground_truth": ground_truth,
# "retriever_type": retriever,
# "response_time": response_time,
# "response_text": response_text,
# "tool": retriever,
# "contextual_precision": precision.score,
# "contextual_recall": recall.score,
# "contextual_relevancy": relevancy.score,
# "faithfulness": faithfulness.score,
# "hallucination": hallucination.score,
# }
# print(result)
# overall.append(result)

# Write to log file
# preprocess.write_data(overall, log_file)
# parse deepeval results
preprocess.read_deepeval_cache()

def query(self, retriever: str, query: str) -> tuple[dict, float]:
"""
@@ -127,8 +111,9 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]:
if retriever != "agent-retriever-reranker"
else f"{self.reranker_base_url}/{endpoint}"
)
payload = {"query": query, "list_context": True, "list_sources": True}
payload = {"query": query, "list_context": True, "list_sources": False}
try:
time.sleep(5)
response = requests.post(url, json=payload)
return response.json(), response.elapsed.total_seconds() * 1000
except Exception as e:
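
A note on the context handling above: context[0] is the single string the backend builds in format_docs.py, so the split has to use the same dashed separator that format_docs.py joins with. A minimal sketch of that round trip (separator string copied from format_docs.py; the chunk texts are made up, and the strip() is an extra step for this illustration — the code above passes the raw split to the test case):

SEP = "\n\n -------------------------- \n\n"  # joiner used in format_docs.py
chunks = ["chunk about the OpenROAD flow", "chunk about global placement"]

joined = SEP.join(chunks)  # what the API returns as context[0] when list_context is set
recovered = [c.strip() for c in joined.split("--------------------------")]

assert recovered == chunks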
2 changes: 0 additions & 2 deletions evaluation/auto_evaluation/llm_tests.sh
@@ -2,7 +2,6 @@

retrievers=(
"agent-retriever" \
"ensemble" \
)

echo "==================================="
@@ -13,6 +12,5 @@ for retriever in "${retrievers[@]}" ; do
--base_url http://localhost:8000 \
--dataset ./dataset/EDA_Corpus_100_Question.csv \
--retriever $retriever
echo "==> Done"
done
echo "==================================="
12 changes: 4 additions & 8 deletions evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -35,18 +35,14 @@ def make_contextual_recall_metric(model: DeepEvalBaseLLM) -> ContextualRecallMet
def make_contextual_relevancy_metric(
model: DeepEvalBaseLLM,
) -> ContextualRelevancyMetric:
return ContextualRelevancyMetric(
threshold=RELEVANCY_THRESHOLD,
model=model,
include_reason=True,
raise NotImplementedError(
"ContextualRelevancyMetric is not implemented due to protobuf incompatability"
)


def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
return FaithfulnessMetric(
threshold=FAITHFULNESS_THRESHOLD,
model=model,
include_reason=True,
raise NotImplementedError(
"FaithfulnessMetric is not implemented due to protobuf incompatability"
)


57 changes: 46 additions & 11 deletions evaluation/auto_evaluation/src/models/vertex_ai.py
@@ -3,10 +3,18 @@
Custom DeepEvalLLM wrapper.
"""

import instructor

from typing import Any

from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
# from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
from vertexai.generative_models import GenerativeModel, HarmBlockThreshold, HarmCategory # type: ignore
from deepeval.models.base_model import DeepEvalBaseLLM
from pydantic import BaseModel


class Response(BaseModel):
content: str


class GoogleVertexAILangChain(DeepEvalBaseLLM):
@@ -26,17 +34,43 @@ def load_model(self, *args, **kwargs):
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

return ChatVertexAI(
return GenerativeModel(
model_name=self.model_name,
safety_settings=safety_settings,
)

def generate(self, prompt: str) -> Any:
return self.model.invoke(prompt).content
def generate(self, prompt: str, schema: BaseModel) -> Any:
instructor_client = instructor.from_vertexai(
client=self.load_model(),
mode=instructor.Mode.VERTEXAI_TOOLS,
)
resp = instructor_client.messages.create( # type: ignore
messages=[
{
"role": "user",
"content": prompt,
}
],
response_model=schema,
)
return resp

async def a_generate(self, prompt: str) -> Any:
response = await self.model.ainvoke(prompt)
return response.content
async def a_generate(self, prompt: str, schema: BaseModel) -> Any:
instructor_client = instructor.from_vertexai(
client=self.load_model(),
mode=instructor.Mode.VERTEXAI_TOOLS,
_async=True,
)
resp = await instructor_client.messages.create( # type: ignore
messages=[
{
"role": "user",
"content": prompt,
}
],
response_model=schema,
)
return resp

def get_model_name(self):
return self.model_name
Expand All @@ -46,21 +80,22 @@ def main():
model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
prompt = "Write me a joke"
print(f"Prompt: {prompt}")
response = model.generate(prompt)
response = model.generate(prompt, schema=Response)
print(f"Response: {response}")


async def main_async():
model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
prompt = "Write me a joke"
print(f"Prompt: {prompt}")
response = await model.a_generate(prompt)
response = await model.a_generate(prompt, Response)
print(f"Response: {response}")


if __name__ == "__main__":
import asyncio
from dotenv import load_dotenv

load_dotenv()
main()
# asyncio.run(main_async())
# main()
asyncio.run(main_async())
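
For context, the point of the instructor wrapper is that callers now pass a pydantic schema and get a parsed object back instead of free-form text. A hypothetical caller using the wrapper defined in this file (the Verdict model below is invented for this sketch; generate is the method added above, and it presumes valid Vertex AI credentials):

from pydantic import BaseModel

class Verdict(BaseModel):  # hypothetical schema, not part of this commit
    verdict: str
    reason: str

model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
result = model.generate("Is water wet? Give a verdict and a one-line reason.", schema=Verdict)
print(result.verdict)
print(result.reason)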
1 change: 1 addition & 0 deletions evaluation/requirements.txt
@@ -12,3 +12,4 @@ deepeval==1.4.9
langchain-google-vertexai==2.0.6
asyncio==3.4.3
huggingface-hub==0.26.2
instructor[vertexai]==1.5.2

1 comment on commit 76f12a6

@luarss (Collaborator, Author) commented on 76f12a6 Nov 10, 2024


===================================
==> Dataset: EDA Corpus
==> Running tests for agent-retriever
/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/__init__.py:49: UserWarning: You are using deepeval version 1.4.9, however version 1.5.0 is available. You should consider upgrading via the "pip install --upgrade deepeval" command.
warnings.warn(

Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s]
Fetching 2 files: 50%|█████ | 1/2 [00:00<00:00, 6.96it/s]
Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 13.88it/s]
Traceback (most recent call last):
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/initializer.py", line 356, in project
self._set_project_as_env_var_or_google_auth_default()
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/initializer.py", line 112, in _set_project_as_env_var_or_google_auth_default
credentials, project = google.auth.default()
^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 657, in default
credentials, project_id = checker()
^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 650, in
lambda: _get_explicit_environ_credentials(quota_project_id=quota_project_id),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 270, in _get_explicit_environ_credentials
credentials, project_id = load_credentials_from_file(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/auth/_default.py", line 114, in load_credentials_from_file
raise exceptions.DefaultCredentialsError(
google.auth.exceptions.DefaultCredentialsError: File src/secret.json was not found.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 145, in
harness = EvaluationHarness(args.base_url, args.dataset, args.reranker_base_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/eval_main.py", line 45, in init
self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/src/models/vertex_ai.py", line 24, in init
super().init(model_name, *args, **kwargs)
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/deepeval/models/base_model.py", line 35, in init
self.model = self.load_model(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/auto_evaluation/src/models/vertex_ai.py", line 37, in load_model
return GenerativeModel(
^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/vertexai/generative_models/_generative_models.py", line 353, in init
project = aiplatform_initializer.global_config.project
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/luarss/actions-runner/_work/ORAssistant/ORAssistant/evaluation/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/initializer.py", line 359, in project
raise GoogleAuthError(project_not_found_exception_str) from exc
google.auth.exceptions.GoogleAuthError: Unable to find your project. Please provide a project ID by:
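
The failure above is an environment problem on the runner rather than a bug in the changed code: Vertex AI cannot find application-default credentials because src/secret.json is absent, and therefore cannot resolve a project ID either. A sketch of making that dependency explicit before constructing the model — the environment-variable names here are illustrative, not taken from this repository's CI:

import os
import vertexai

# Assumption: the CI job supplies a service-account key file and a project ID.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "src/secret.json")
vertexai.init(
    project=os.environ["GOOGLE_CLOUD_PROJECT"],  # hypothetical variable name
    location=os.environ.get("GOOGLE_CLOUD_REGION", "us-central1"),
)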
