Consolidate Llama model usage in tests #13094

Merged · 8 commits · Feb 14, 2025
Changes from 2 commits
10 changes: 5 additions & 5 deletions tests/basic_correctness/test_basic_correctness.py
@@ -17,7 +17,7 @@

MODELS = [
"google/gemma-2-2b-it",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -96,12 +96,12 @@ def test_models(
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
@@ -116,7 +116,7 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
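For reference, the Ray compiled-DAG path toggled above can be exercised outside the test harness. A minimal sketch using the `LLM` API, assuming a host with two GPUs; the prompt and sampling settings are illustrative:

```python
import os

from vllm import LLM, SamplingParams

# Same switches the test sets before building the engine: run workers in SPMD
# mode and execute the model through Ray's compiled DAG.
os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    tensor_parallel_size=2,
    distributed_executor_backend="ray",
)
outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```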
6 changes: 3 additions & 3 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -20,7 +20,7 @@

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]


@@ -92,7 +92,7 @@ def test_models_distributed(
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)

if (model == "meta-llama/Llama-2-7b-hf"
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model = "meta-llama/Llama-2-7b-chat-hf"
model = "meta-llama/Llama-3.2-1B-Instruct"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [
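As context for `test_with_prefix_caching`: it combines chunked prefill with prefix caching over a shared prompt prefix. A rough standalone sketch, assuming the engine flags keep their current names; the small token budget simply forces long prompts to be prefilled in chunks:

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enable_chunked_prefill=True,
    enable_prefix_caching=True,
    max_num_batched_tokens=64,  # small budget so long prompts are split into chunks
)

common_prefix = "You are a helpful AI assistant " * 20
prompts = [common_prefix + q for q in ("Hello.", "What is 2 + 2?")]

# The second prompt should reuse the cached KV blocks of the shared prefix.
outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)
```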
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_cpu_offload.py
@@ -4,5 +4,5 @@


def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B", [],
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"])
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_cumem.py
@@ -118,7 +118,7 @@ def model(x):
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B", # sleep mode with safetensors
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
])
def test_end_to_end(model):
2 changes: 1 addition & 1 deletion tests/compile/test_basic_correctness.py
@@ -26,7 +26,7 @@ class TestSetting:
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B",
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
10 changes: 2 additions & 8 deletions tests/compile/utils.py
@@ -6,7 +6,6 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
from vllm.platforms import current_platform

TEST_MODELS = [
@@ -19,10 +18,10 @@
"dtype": torch.float16,
"quantization": "fp8"
}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]

if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

-# The base meta llama uses too much memory.
-if (model == "meta-llama/Meta-Llama-3-8B"
-and optimization_level >= CompilationLevel.PIECEWISE):
-return

print(f"MODEL={model}")

prompts = [
4 changes: 2 additions & 2 deletions tests/distributed/test_pipeline_parallel.py
@@ -158,7 +158,7 @@ def iter_params(self, model_name: str):
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
@@ -226,7 +226,7 @@ def iter_params(self, model_name: str):
TEST_MODELS = [
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B-Instruct",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
2 changes: 1 addition & 1 deletion tests/engine/test_stop_strings.py
@@ -6,7 +6,7 @@

from vllm import CompletionOutput, LLMEngine, SamplingParams

MODEL = "meta-llama/llama-2-7b-hf"
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MAX_TOKENS = 200

IS_ASYNC = False
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_models.py
@@ -14,7 +14,7 @@
OpenAIServingModels)
from vllm.lora.request import LoRARequest

MODEL_NAME = "meta-llama/Llama-2-7b"
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.")
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_shutdown.py
@@ -5,7 +5,7 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.asyncio
10 changes: 4 additions & 6 deletions tests/kv_transfer/disagg_test.py
@@ -28,7 +28,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8100",
"--gpu-memory-utilization",
@@ -49,7 +49,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8200",
"--gpu-memory-utilization",
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8100/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 1,
"temperature": 0
@@ -112,8 +111,7 @@
response = requests.post("http://localhost:8200/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 10,
"temperature": 0
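The request flow asserted by `test_disaggregated_prefilling` is: send the prompt to the prefill instance (port 8100) with `max_tokens=1` so it only produces KV cache, then send the real request to the decode instance (port 8200). A standalone sketch of that flow, assuming both servers are already running as configured above:

```python
import requests

HEADERS = {"Content-Type": "application/json"}
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
PROMPT = "San Francisco is a"

# 1) Prefill-only request; the single generated token is discarded.
requests.post(
    "http://localhost:8100/v1/completions",
    headers=HEADERS,
    json={"model": MODEL, "prompt": PROMPT, "max_tokens": 1, "temperature": 0},
).raise_for_status()

# 2) Decode request; the decode instance consumes the transferred KV cache.
resp = requests.post(
    "http://localhost:8200/v1/completions",
    headers=HEADERS,
    json={"model": MODEL, "prompt": PROMPT, "max_tokens": 10, "temperature": 0},
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```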
9 changes: 5 additions & 4 deletions tests/lora/conftest.py
@@ -286,7 +286,7 @@ def long_context_infos(long_context_lora_files_16k_1,


@pytest.fixture
-def llama_2_7b_engine_extra_embeddings():
+def llama_3p2_1b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True)
get_model_old = get_model

@@ -296,15 +296,16 @@ def get_model_patched(**kwargs):
return get_model_old(**kwargs)

with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
engine = vllm.LLM("meta-llama/Llama-3.2-1B-Instruct",
enable_lora=False)
yield engine.llm_engine
del engine
cleanup_dist_env_and_memory(shutdown_ray=True)


@pytest.fixture
-def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
-yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
+def llama_3p2_1b_model_extra_embeddings(llama_3p2_1b_engine_extra_embeddings):
+yield (llama_3p2_1b_engine_extra_embeddings.model_executor.driver_worker.
model_runner.model)


4 changes: 2 additions & 2 deletions tests/lora/test_long_context.py
@@ -117,7 +117,7 @@ def lora_llm(long_context_infos):
]

llm = vllm.LLM(
"meta-llama/Llama-2-13b-chat-hf",
"meta-llama/Llama-3.2-1B-Instruct",
enable_lora=True,
max_num_seqs=16,
max_loras=2,
@@ -136,7 +136,7 @@ def test_rotary_emb_replaced(dist_init):
"""Verify rotary emb in all the layers are replaced"""
from vllm.engine.arg_utils import EngineArgs
from vllm.worker.model_runner import ModelRunner
-engine_args = EngineArgs("meta-llama/Llama-2-7b-hf",
+engine_args = EngineArgs("meta-llama/Llama-3.2-1B-Instruct",
long_lora_scaling_factors=(4.0, ),
enable_lora=True)
engine_config = engine_args.create_engine_config()
12 changes: 6 additions & 6 deletions tests/lora/test_lora_manager.py
@@ -415,15 +415,15 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):


@pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
+def test_lru_cache_worker_adapter_manager(llama_3p2_1b_model_extra_embeddings,
sql_lora_files, device):
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_adapter_manager = LRUCacheWorkerLoRAManager(
-4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+4, 2, llama_3p2_1b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, device,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(
-llama_2_7b_model_extra_embeddings)
+llama_3p2_1b_model_extra_embeddings)

mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
@@ -494,16 +494,16 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,


@pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
+def test_worker_adapter_manager(llama_3p2_1b_model_extra_embeddings,
sql_lora_files, device):
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_adapter_manager = WorkerLoRAManager(
-4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+4, 2, llama_3p2_1b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, device,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(
-llama_2_7b_model_extra_embeddings)
+llama_3p2_1b_model_extra_embeddings)

mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
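Context for the renamed fixtures: these tests drive the worker-side LoRA managers against the 1B base model. A minimal sketch of serving a LoRA adapter on the same base model through the public API; the adapter name and path are placeholders:

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enable_lora=True,
    max_loras=4,
    max_lora_rank=8,
)

# Hypothetical local adapter checkpoint; substitute a real LoRA directory.
sql_lora = LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter")

outputs = llm.generate(
    ["Write a SQL query that counts users per country."],
    SamplingParams(temperature=0.0, max_tokens=64),
    lora_request=sql_lora,
)
print(outputs[0].outputs[0].text)
```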
4 changes: 2 additions & 2 deletions tests/lora/test_worker.py
@@ -17,9 +17,9 @@
def test_worker_apply_lora(sql_lora_files):
vllm_config = VllmConfig(
model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-1B-Instruct",
task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer="meta-llama/Llama-3.2-1B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
8 changes: 4 additions & 4 deletions tests/models/decoder_only/language/test_fp8.py
@@ -26,12 +26,12 @@
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
-# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
-# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-"meta-llama/Llama-2-7b-chat-hf")
+# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+"meta-llama/Llama-3.2-1B-Instruct")
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
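The parametrization above pairs a KV-cache dtype with a base checkpoint and a reference checkpoint. A minimal sketch of the BF16-weights / `fp8_e5m2` KV-cache case outside pytest, assuming `kv_cache_dtype` keeps its current spelling; the prompt is illustrative:

```python
from vllm import LLM, SamplingParams

# BF16 weights with an fp8_e5m2 KV cache; e5m2 needs no calibration scales.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    kv_cache_dtype="fp8_e5m2",
)

outputs = llm.generate(
    ["The theory of relativity states that"],
    SamplingParams(temperature=0.0, max_tokens=4),
)
print(outputs[0].outputs[0].text)
```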
2 changes: 1 addition & 1 deletion tests/models/registry.py
@@ -139,7 +139,7 @@ def check_available_online(
trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
8 changes: 4 additions & 4 deletions tests/quantization/test_cpu_offload.py
@@ -1,5 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
-
+# SPDX-License-Identifier: Apache-2.0
+# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py

@@ -14,9 +14,9 @@
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct",
["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "2"],
["--quantization", "fp8", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
2 changes: 1 addition & 1 deletion tests/quantization/test_register_quantization_config.py
@@ -99,7 +99,7 @@ def test_register_quantization_config():

@pytest.mark.parametrize(argnames="model",
argvalues=[
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
])
def test_custom_quant(vllm_runner, model):
"""Test infer with the custom quantization method."""
2 changes: 1 addition & 1 deletion tests/samplers/test_ignore_eos.py
@@ -10,7 +10,7 @@

# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]


@pytest.mark.parametrize("model", MODELS)
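The comment in this file refers to a past regression in which EOS tokens declared in a model's `generation_config` ended generation even with `ignore_eos` set. A minimal sketch of the behaviour the test pins down, assuming greedy sampling; the prompt is illustrative:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")

# With ignore_eos=True the engine keeps generating past any EOS token, so the
# completion should contain exactly max_tokens tokens.
params = SamplingParams(temperature=0.0, max_tokens=64, ignore_eos=True)
outputs = llm.generate(["Say hello and then stop."], params)
assert len(outputs[0].outputs[0].token_ids) == 64
```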
6 changes: 3 additions & 3 deletions tests/spec_decode/e2e/test_compatibility.py
@@ -8,7 +8,7 @@


@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "meta-llama/Llama-2-7b-chat-hf",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@@ -27,8 +27,8 @@
},
{
# Speculative max model len > target max model len should raise.
-# https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
-"speculative_max_model_len": 4096 + 1,
+# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
+"speculative_max_model_len": 131072 + 1,
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
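For context on the updated bound: `speculative_max_model_len` may not exceed the target model's maximum context length (131072 for Llama-3.2-1B-Instruct, per the config linked above), and the test expects engine construction to fail. A rough sketch of that failure mode, assuming the flat speculative-decoding arguments used elsewhere in this file are still accepted:

```python
import pytest

from vllm import LLM

# Requesting a speculative max model len beyond the target model's
# 131072-token context should be rejected while the engine config is validated.
with pytest.raises(ValueError):
    LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        speculative_model="JackFram/llama-68m",
        num_speculative_tokens=5,
        speculative_max_model_len=131072 + 1,
    )
```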