diff --git a/docs/references/accuracy.md b/docs/references/accuracy.md
index f468679c821..efe2b537449 100644
--- a/docs/references/accuracy.md
+++ b/docs/references/accuracy.md
@@ -23,110 +23,6 @@ python3 bench_sglang.py --nsub 10 # Test 10 subjects
 cat result.jsonl | grep -oP '"accuracy": \K\d+\.\d+'
 ```
 
-## Benchmark-Specific Implementation Details
-
-Most benchmarks are similar with task-specific differences. Below, We compare [GSM8K](https://github.com/sgl-project/sglang/tree/main/benchmark/gsm8k) and [MMLU](https://github.com/sgl-project/sglang/tree/main/benchmark/mmlu).
-
-```python
-# GSM8K Evaluation Script
-
-# Build few-shot prompt with examples
-def get_one_example(lines, i, include_answer):
-    # Basic Q&A format
-    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
-    if include_answer:
-        ret += " " + lines[i]["answer"]
-    return ret
-
-def get_few_shot_examples(lines, k):
-    # Include k annotated examples
-    ret = ""
-    for i in range(k):
-        ret += get_one_example(lines, i, True) + "\n\n"
-    return ret
-
-# Create test dataset
-for i in range(len(lines[:num_questions])):
-    questions.append(get_one_example(lines, i, False))
-    labels.append(get_answer_value(lines[i]["answer"]))
-
-# Assemble full prompt
-@sgl.function
-def few_shot_gsm8k(s, question):
-    s += few_shot_examples + question
-    s += sgl.gen(
-        "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
-    )
-
-# Extract numerical answer
-def get_answer_value(answer_str):
-    answer_str = answer_str.replace(",", "")
-    numbers = re.findall(r"\d+", answer_str)
-    if len(numbers) < 1:
-        return INVALID
-    try:
-        return ast.literal_eval(numbers[-1])
-    except SyntaxError:
-        return INVALID
-
-# Run batch inference
-states = few_shot_gsm8k.run_batch(
-    arguments,
-    temperature=0,
-    num_threads=args.parallel,
-    progress_bar=True,
-    )
-```
-
-```python
-# MMLU Evaluation Script
-
-# Format multiple-choice question
-def format_example(df, idx, include_answer=True):
-    prompt = df.iloc[idx, 0] # Question text
-    k = df.shape[1] - 2 # Number of options
-    for j in range(k):
-        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
-    prompt += "\nAnswer:"
-    if include_answer:
-        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
-    return prompt
-
-def gen_prompt(train_df, subject, k=-1):
-    # Create subject-specific header
-    prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
-        format_subject(subject)
-    )
-    if k == -1:
-        k = train_df.shape[0]
-    for i in range(k):
-        prompt += format_example(train_df, i)
-    return prompt
-
-# Assemble full prompt
-@sgl.function
-def few_shot_mmlu(s, examples, question):
-    s += examples + question + sgl.gen("answer")
-
-# Batch inference with letter prediction
-states = few_shot_mmlu.run_batch(
-    arguments,
-    temperature=0,
-    max_new_tokens=1, # Generate only one token
-    backend=backend,
-    num_threads=args.parallel,
-    progress_bar=True,
-)
-
-# Extract predicted choice
-preds = [
-    s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else ""
-    for s in states
-]
-```
-
-The core implementation is largely similar, differing mainly in prompt definition and response extraction. Other benchmarks can be analyzed in a similar way.
-
 ## Customizing Benchmark Scripts
 
 Some benchmark implementations may differ from ours, causing accuracy discrepancies. To match [[Qwen2.5-Math]](https://github.com/QwenLM/Qwen2.5-Math)'s reported 76.8% GSM8K accuracy, customization is required.