Commit 6b3976a
NathanHB committed Feb 18, 2025
1 parent 8acb461
Showing 4 changed files with 24 additions and 14 deletions.
2 changes: 1 addition & 1 deletion src/lighteval/metrics/llm_as_judge.py
@@ -240,7 +240,7 @@ def __call_api(prompt):

     def __call_api_parallel(self, prompts):
         results = []
-        with ThreadPoolExecutor(100) as executor:
+        with ThreadPoolExecutor(10) as executor:
             for entry in tqdm(executor.map(self.__call_api, prompts), total=len(prompts)):
                 results.append(entry)

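For reference, a minimal sketch of the pattern this hunk tunes: a bounded ThreadPoolExecutor that caps how many judge API requests run at once (10 here, matching the new value), while executor.map keeps results in prompt order. The names call_api and call_api_parallel are illustrative stand-ins, not the repository's code.

from concurrent.futures import ThreadPoolExecutor

def call_api(prompt: str) -> str:
    # Placeholder for a real HTTP request to the judge model.
    return f"response to {prompt!r}"

def call_api_parallel(prompts: list[str], max_workers: int = 10) -> list[str]:
    # At most `max_workers` requests are in flight at any time;
    # map() preserves input order, so results[i] corresponds to prompts[i].
    with ThreadPoolExecutor(max_workers) as executor:
        return list(executor.map(call_api, prompts))

print(call_api_parallel(["a", "b", "c"]))

A smaller pool lowers the number of simultaneous requests hitting the judge endpoint, at the cost of throughput.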
1 change: 1 addition & 0 deletions src/lighteval/models/vllm/vllm_model.py
@@ -361,6 +361,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
                 if x is not None
             ]
         else:
+            breakpoint()
             outputs = self.model.generate(
                 prompt_token_ids=inputs,
                 sampling_params=sampling_params,
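For context on the one line added above: Python's built-in breakpoint() suspends execution and enters the configured debugger (pdb by default; the PYTHONBREAKPOINT environment variable can redirect or disable it), so this branch will pause before every generate call. A standalone illustration, not the repository's code:

def generate_batches(batches):
    # breakpoint() calls sys.breakpointhook(); by default that is pdb.set_trace,
    # and PYTHONBREAKPOINT=0 turns it into a no-op.
    outputs = []
    for batch in batches:
        breakpoint()  # execution stops here on every iteration
        outputs.append([token * 2 for token in batch])  # stand-in for model.generate
    return outputs

print(generate_batches([[1, 2], [3, 4]]))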
1 change: 0 additions & 1 deletion src/lighteval/tasks/extended/__init__.py
@@ -32,7 +32,6 @@
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

-
     AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]

 else:
34 changes: 22 additions & 12 deletions src/lighteval/tasks/extended/hle/main.py
@@ -105,18 +105,27 @@ def __init__(self):
             response_format=ExtractedAnswer,
         )

-    def compute(self, predictions, formatted_doc: Doc, **kwargs):
+    def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
-        gold = formatted_doc.get_golds()[0]
-
-        score, _, _ = self.judge.evaluate_answer(question=formatted_doc.query, answer=predictions[0], gold=gold)
-
-        score["correct_answer"] = gold
-        return {
-            "accuracy": score,
-            "confidence_half_width": score,
-            "calibration_error": score,
-        }
+        questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
+        golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs]
+        predictions = [response[0].result[0] for response in responses]
+        options = [None] * len(questions)
+
+        score, _, _ = self.judge.evaluate_answer_batch(questions, predictions, options, golds)
+
+        metrics = []
+        for i in range(len(sample_ids)):
+            score[i]["correct_answer"] = golds[i]
+            metrics.append(
+                {
+                    "accuracy": score[i],
+                    "confidence_half_width": score[i],
+                    "calibration_error": score[i],
+                }
+            )
+
+        return metrics

     def compute_corpus(self, scores: List[dict]):
         n = len(scores)
@@ -193,13 +202,14 @@ def hle_text_only(line, task_name: str = None):
         query=f"Question: {line['question']}\nAnswer:",
         choices=[line["answer"]],
         gold_index=0,
+        specific={"question": line["question"]},
     )


 hle_metrics = CorpusLevelMetricGrouping(
     metric_name=["accuracy", "confidence_half_width", "calibration_error"],
     higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
-    category=MetricCategory.GENERATIVE,
+    category=MetricCategory.LLM_AS_JUDGE,
     use_case=MetricUseCase.ACCURACY,
     sample_level_fn=JudgeLLMHLE().compute,
     corpus_level_fn=JudgeLLMHLE().compute_corpus,
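To make the reworked flow concrete, here is a minimal, self-contained sketch of the batched pattern the new compute() follows; FakeDoc, FakeJudge, and the simplified compute below are mocked stand-ins, not lighteval's classes. Each doc carries its question in the specific dict (which is why hle_text_only now sets specific={"question": ...}), the judge is called once per batch, and one metric dict is emitted per sample id.

from dataclasses import dataclass, field

@dataclass
class FakeDoc:
    query: str
    golds: list[str]
    specific: dict = field(default_factory=dict)

    def get_golds(self):
        return self.golds

class FakeJudge:
    def evaluate_answer_batch(self, questions, answers, options, golds):
        # One score dict per sample; a real judge would also return prompts and raw outputs.
        scores = [{"judgement": answer == gold} for answer, gold in zip(answers, golds)]
        return scores, None, None

def compute(sample_ids, predictions, docs, judge):
    questions = [doc.specific["question"] for doc in docs]
    golds = [doc.get_golds()[0] for doc in docs]
    scores, _, _ = judge.evaluate_answer_batch(questions, predictions, [None] * len(questions), golds)
    metrics = []
    for i in range(len(sample_ids)):
        scores[i]["correct_answer"] = golds[i]
        metrics.append({"accuracy": scores[i]})
    return metrics

docs = [FakeDoc(query="Question: 2+2?\nAnswer:", golds=["4"], specific={"question": "2+2?"})]
print(compute(["id-0"], ["4"], docs, FakeJudge()))

The judge is invoked once per batch instead of once per sample, which matches the evaluate_answer_batch call in the new compute().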
