diff --git a/docs/references/accuracy.md b/docs/references/accuracy.md
index f468679c821..efe2b537449 100644
--- a/docs/references/accuracy.md
+++ b/docs/references/accuracy.md
@@ -23,110 +23,6 @@ python3 bench_sglang.py --nsub 10 # Test 10 subjects
 cat result.jsonl | grep -oP '"accuracy": \K\d+\.\d+'
 ```
 
-## Benchmark-Specific Implementation Details
-
-Most benchmarks are similar with task-specific differences. Below, We compare [GSM8K](https://github.com/sgl-project/sglang/tree/main/benchmark/gsm8k) and [MMLU](https://github.com/sgl-project/sglang/tree/main/benchmark/mmlu).
-
-```python
-# GSM8K Evaluation Script
-
-# Build few-shot prompt with examples
-def get_one_example(lines, i, include_answer):
-    # Basic Q&A format
-    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
-    if include_answer:
-        ret += " " + lines[i]["answer"]
-    return ret
-
-def get_few_shot_examples(lines, k):
-    # Include k annotated examples
-    ret = ""
-    for i in range(k):
-        ret += get_one_example(lines, i, True) + "\n\n"
-    return ret
-
-# Create test dataset
-for i in range(len(lines[:num_questions])):
-    questions.append(get_one_example(lines, i, False))
-    labels.append(get_answer_value(lines[i]["answer"]))
-
-# Assemble full prompt
-@sgl.function
-def few_shot_gsm8k(s, question):
-    s += few_shot_examples + question
-    s += sgl.gen(
-        "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
-    )
-
-# Extract numerical answer
-def get_answer_value(answer_str):
-    answer_str = answer_str.replace(",", "")
-    numbers = re.findall(r"\d+", answer_str)
-    if len(numbers) < 1:
-        return INVALID
-    try:
-        return ast.literal_eval(numbers[-1])
-    except SyntaxError:
-        return INVALID
-
-# Run batch inference
-states = few_shot_gsm8k.run_batch(
-    arguments,
-    temperature=0,
-    num_threads=args.parallel,
-    progress_bar=True,
-    )
-```
-
-```python
-# MMLU Evaluation Script
-
-# Format multiple-choice question
-def format_example(df, idx, include_answer=True):
-    prompt = df.iloc[idx, 0] # Question text
-    k = df.shape[1] - 2 # Number of options
-    for j in range(k):
-        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
-    prompt += "\nAnswer:"
-    if include_answer:
-        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
-    return prompt
-
-def gen_prompt(train_df, subject, k=-1):
-    # Create subject-specific header
-    prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
-        format_subject(subject)
-    )
-    if k == -1:
-        k = train_df.shape[0]
-    for i in range(k):
-        prompt += format_example(train_df, i)
-    return prompt
-
-# Assemble full prompt
-@sgl.function
-def few_shot_mmlu(s, examples, question):
-    s += examples + question + sgl.gen("answer")
-
-# Batch inference with letter prediction
-states = few_shot_mmlu.run_batch(
-    arguments,
-    temperature=0,
-    max_new_tokens=1, # Generate only one token
-    backend=backend,
-    num_threads=args.parallel,
-    progress_bar=True,
-)
-
-# Extract predicted choice
-preds = [
-    s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else ""
-    for s in states
-]
-```
-
-The core implementation is largely similar, differing mainly in prompt definition and response extraction. Other benchmarks can be analyzed in a similar way.
-
 ## Customizing Benchmark Scripts
 
 Some benchmark implementations may differ from ours, causing accuracy discrepancies. To match [[Qwen2.5-Math]](https://github.com/QwenLM/Qwen2.5-Math)'s reported 76.8% GSM8K accuracy, customization is required.