Fix nightly accuracy tests (#2780)
merrymercy authored Jan 8, 2025
1 parent 6fb5768 commit b22f3f6
Showing 5 changed files with 33 additions and 29 deletions.
2 changes: 1 addition & 1 deletion python/sglang/test/test_utils.py
@@ -36,7 +36,7 @@
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
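Side note: the `DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_*` constants above are comma-separated lists of Hugging Face model IDs. A minimal sketch of how a nightly runner could expand one of them into per-model runs — the `parse_models` helper below is illustrative, not taken from this diff:

```python
# Illustrative sketch only: expand a comma-separated model-list constant
# into individual nightly runs. Assumes the runner simply splits on commas.
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = (
    "meta-llama/Llama-3.1-70B-Instruct,"
    "mistralai/Mixtral-8x7B-Instruct-v0.1,"
    "Qwen/Qwen2-57B-A14B-Instruct"
)

def parse_models(model_string: str) -> list[str]:
    # Split on commas and drop stray whitespace or empty entries.
    return [m.strip() for m in model_string.split(",") if m.strip()]

for model in parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2):
    print(f"would evaluate {model} with --tp 2")
```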
3 changes: 1 addition & 2 deletions test/srt/run_suite.py
@@ -49,8 +49,7 @@
],
"nightly": [
"test_nightly_gsm8k_eval.py",
"test_nightly_human_eval.py",
# Disable temporarly
# Disable temporarily
# "test_nightly_math_eval.py",
],
"sampling/penaltylib": glob.glob(
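For context, `run_suite.py` keeps suites as a name-to-file-list mapping like the `"nightly"` entry above. A hedged sketch of how such a suite might be executed — the actual runner's timeout, ordering, and reporting logic may differ:

```python
import subprocess
import sys

# Hedged sketch: run each test file in a suite as its own process and
# stop on the first failure. Mirrors the suite dict shape above only;
# the real run_suite.py logic is not shown in this diff.
suites = {
    "nightly": [
        "test_nightly_gsm8k_eval.py",
        # "test_nightly_math_eval.py",  # disabled temporarily
    ],
}

for test_file in suites["nightly"]:
    ret = subprocess.run([sys.executable, test_file]).returncode
    if ret != 0:
        sys.exit(ret)
```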
49 changes: 26 additions & 23 deletions test/srt/test_nightly_gsm8k_eval.py
@@ -1,6 +1,5 @@
import json
import os
-import subprocess
import unittest
import warnings
from datetime import datetime
@@ -16,24 +15,26 @@
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
+is_in_ci,
popen_launch_server,
+write_github_step_summary,
)

MODEL_SCORE_THRESHOLDS = {
"meta-llama/Llama-3.1-8B-Instruct": 0.83,
"meta-llama/Llama-3.1-8B-Instruct": 0.82,
"mistralai/Mistral-7B-Instruct-v0.3": 0.58,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
"google/gemma-2-27b-it": 0.92,
"meta-llama/Llama-3.1-70B-Instruct": 0.96,
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63,
"Qwen/Qwen2-57B-A14B-Instruct": 0.87,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
"meta-llama/Llama-3.1-70B-Instruct": 0.95,
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
"Qwen/Qwen2-57B-A14B-Instruct": 0.88,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
"neuralmagic/gemma-2-2b-it-FP8": 0.60,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
"neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62,
"neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
"neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_args,
-return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
)
return process

@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):

def check_model_scores(results):
failed_models = []
summary = " | model | score | threshold |\n"
summary += "| ----- | ----- | --------- |\n"

for model, score in results:
threshold = MODEL_SCORE_THRESHOLDS.get(model)
if threshold is None:
@@ -111,11 +114,19 @@ def check_model_scores(results):
f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
)

line = f"| {model} | {score} | {threshold} |\n"
summary += line

print(summary)

if is_in_ci():
write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")

if failed_models:
raise AssertionError("\n".join(failed_models))


-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyGsm8KEval(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model_groups = [
@@ -127,13 +138,6 @@ def setUpClass(cls):
]
cls.base_url = DEFAULT_URL_FOR_TEST

-def setUp(self):
-self.process = None
-
-def tearDown(self):
-if self.process:
-kill_process_tree(self.process.pid)
-
def test_mgsm_en_all_models(self):
warnings.filterwarnings(
"ignore", category=ResourceWarning, message="unclosed.*socket"
@@ -144,7 +148,7 @@ def test_mgsm_en_all_models(self):
for model_group, is_fp8, is_tp2 in self.model_groups:
for model in model_group:
with self.subTest(model=model):
-self.process = launch_server(self.base_url, model, is_fp8, is_tp2)
+process = launch_server(self.base_url, model, is_fp8, is_tp2)

args = SimpleNamespace(
base_url=self.base_url,
@@ -163,8 +167,7 @@ def test_mgsm_en_all_models(self):
is_first = False

all_results.append((model, metrics["score"]))
-
-self.tearDown()
+kill_process_tree(process.pid)

try:
with open("results.json", "r") as f:
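The reworked `check_model_scores` accumulates a markdown table and, in CI, publishes it through `write_github_step_summary`. A self-contained sketch of that pattern, assuming the helper appends markdown to the file named by GitHub Actions' `GITHUB_STEP_SUMMARY` environment variable (the sglang helper's internals are not shown in this diff):

```python
import os

def write_step_summary(content: str) -> None:
    # Assumption: GitHub Actions exposes a file path in GITHUB_STEP_SUMMARY;
    # markdown appended there is rendered on the workflow run page.
    path = os.environ.get("GITHUB_STEP_SUMMARY")
    if path:
        with open(path, "a") as f:
            f.write(content)
    else:
        print(content)  # local fallback when not running in CI

# Example data shaped like the test's (model, score) results.
results = [("meta-llama/Llama-3.1-8B-Instruct", 0.84)]
thresholds = {"meta-llama/Llama-3.1-8B-Instruct": 0.82}

summary = "| model | score | threshold |\n"
summary += "| ----- | ----- | --------- |\n"
for model, score in results:
    summary += f"| {model} | {score} | {thresholds[model]} |\n"

write_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
```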
2 changes: 1 addition & 1 deletion test/srt/test_nightly_human_eval.py
@@ -18,7 +18,7 @@
)


-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyHumanEval(unittest.TestCase):
@classmethod
def setUpClass(cls):
if is_in_ci():
6 changes: 4 additions & 2 deletions test/srt/test_skip_tokenizer_init.py
@@ -55,8 +55,10 @@ def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1):
print(json.dumps(ret))

def assert_one_item(item):
assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"]
assert len(item["token_ids"]) == max_new_tokens
self.assertEqual(
len(item["token_ids"]), item["meta_info"]["completion_tokens"]
)
self.assertEqual(len(item["token_ids"]), max_new_tokens)
assert item["meta_info"]["prompt_tokens"] == len(input_ids)

if return_logprob:
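The swap from bare `assert` to `self.assertEqual` is about failure diagnostics: `assertEqual` reports both operands when the check fails, while a bare `assert` raises an empty `AssertionError` (and is stripped entirely under `python -O`). A tiny illustration:

```python
import unittest

class AssertStyleDemo(unittest.TestCase):
    def test_bare_assert(self):
        n_tokens = 7
        # On failure: bare "AssertionError" with no values shown.
        assert n_tokens == 8

    def test_assert_equal(self):
        n_tokens = 7
        # On failure: "AssertionError: 7 != 8" -- both values shown.
        self.assertEqual(n_tokens, 8)

if __name__ == "__main__":
    unittest.main()
```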
