Consolidate Llama model usage in tests #13094

Merged · 8 commits · Feb 14, 2025
Changes from 2 commits
10 changes: 5 additions & 5 deletions tests/basic_correctness/test_basic_correctness.py
@@ -17,7 +17,7 @@

MODELS = [
"google/gemma-2-2b-it",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -96,12 +96,12 @@ def test_models(
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
@@ -116,7 +116,7 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
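For reference, the Ray compiled-DAG path toggled above can be exercised outside the test harness. A minimal sketch using the `LLM` API, assuming a host with two GPUs; the prompt and sampling settings are illustrative:

```python
import os

from vllm import LLM, SamplingParams

# Same switches the test sets before building the engine: run workers in SPMD
# mode and execute the model through Ray's compiled DAG.
os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    tensor_parallel_size=2,
    distributed_executor_backend="ray",
)
outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```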
6 changes: 3 additions & 3 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -20,7 +20,7 @@

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]


@@ -92,7 +92,7 @@ def test_models_distributed(
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)

if (model == "meta-llama/Llama-2-7b-hf"
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model = "meta-llama/Llama-2-7b-chat-hf"
model = "meta-llama/Llama-3.2-1B-Instruct"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [
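As context for `test_with_prefix_caching`: it combines chunked prefill with prefix caching over a shared prompt prefix. A rough standalone sketch, assuming the engine flags keep their current names; the small token budget simply forces long prompts to be prefilled in chunks:

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enable_chunked_prefill=True,
    enable_prefix_caching=True,
    max_num_batched_tokens=64,  # small budget so long prompts are split into chunks
)

common_prefix = "You are a helpful AI assistant " * 20
prompts = [common_prefix + q for q in ("Hello.", "What is 2 + 2?")]

# The second prompt should reuse the cached KV blocks of the shared prefix.
outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)
```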
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_cpu_offload.py
@@ -4,5 +4,5 @@


def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B", [],
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"])
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_cumem.py
@@ -118,7 +118,7 @@ def model(x):
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B", # sleep mode with safetensors
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
])
def test_end_to_end(model):
2 changes: 1 addition & 1 deletion tests/compile/test_basic_correctness.py
@@ -26,7 +26,7 @@ class TestSetting:
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B",
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
10 changes: 2 additions & 8 deletions tests/compile/utils.py
@@ -6,7 +6,6 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
from vllm.platforms import current_platform

TEST_MODELS = [
@@ -19,10 +18,10 @@
"dtype": torch.float16,
"quantization": "fp8"
}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]

if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

-# The base meta llama uses too much memory.
-if (model == "meta-llama/Meta-Llama-3-8B"
-and optimization_level >= CompilationLevel.PIECEWISE):
-return

print(f"MODEL={model}")

prompts = [
4 changes: 2 additions & 2 deletions tests/distributed/test_pipeline_parallel.py
@@ -158,7 +158,7 @@ def iter_params(self, model_name: str):
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
@@ -226,7 +226,7 @@ def iter_params(self, model_name: str):
TEST_MODELS = [
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B-Instruct",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
2 changes: 1 addition & 1 deletion tests/engine/test_stop_strings.py
@@ -6,7 +6,7 @@

from vllm import CompletionOutput, LLMEngine, SamplingParams

MODEL = "meta-llama/llama-2-7b-hf"
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MAX_TOKENS = 200

IS_ASYNC = False
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_models.py
@@ -14,7 +14,7 @@
OpenAIServingModels)
from vllm.lora.request import LoRARequest

MODEL_NAME = "meta-llama/Llama-2-7b"
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.")
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_shutdown.py
@@ -5,7 +5,7 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.asyncio
10 changes: 4 additions & 6 deletions tests/kv_transfer/disagg_test.py
@@ -28,7 +28,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8100",
"--gpu-memory-utilization",
@@ -49,7 +49,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8200",
"--gpu-memory-utilization",
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8100/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 1,
"temperature": 0
@@ -112,8 +111,7 @@
response = requests.post("http://localhost:8200/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 10,
"temperature": 0
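The request flow asserted by `test_disaggregated_prefilling` is: send the prompt to the prefill instance (port 8100) with `max_tokens=1` so it only produces KV cache, then send the real request to the decode instance (port 8200). A standalone sketch of that flow, assuming both servers are already running as configured above:

```python
import requests

HEADERS = {"Content-Type": "application/json"}
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
PROMPT = "San Francisco is a"

# 1) Prefill-only request; the single generated token is discarded.
requests.post(
    "http://localhost:8100/v1/completions",
    headers=HEADERS,
    json={"model": MODEL, "prompt": PROMPT, "max_tokens": 1, "temperature": 0},
).raise_for_status()

# 2) Decode request; the decode instance consumes the transferred KV cache.
resp = requests.post(
    "http://localhost:8200/v1/completions",
    headers=HEADERS,
    json={"model": MODEL, "prompt": PROMPT, "max_tokens": 10, "temperature": 0},
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```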
9 changes: 5 additions & 4 deletions tests/lora/conftest.py
@@ -286,7 +286,7 @@ def long_context_infos(long_context_lora_files_16k_1,


@pytest.fixture
-def llama_2_7b_engine_extra_embeddings():
+def llama_3p2_1b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True)
get_model_old = get_model

@@ -296,15 +296,16 @@ def get_model_patched(**kwargs):
return get_model_old(**kwargs)

with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
engine = vllm.LLM("meta-llama/Llama-3.2-1B-Instruct",
enable_lora=False)
yield engine.llm_engine
del engine
cleanup_dist_env_and_memory(shutdown_ray=True)


@pytest.fixture
-def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
-yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
+def llama_3p2_1b_model_extra_embeddings(llama_3p2_1b_engine_extra_embeddings):
+yield (llama_3p2_1b_engine_extra_embeddings.model_executor.driver_worker.
model_runner.model)


4 changes: 2 additions & 2 deletions tests/lora/test_long_context.py
@@ -117,7 +117,7 @@ def lora_llm(long_context_infos):
]

llm = vllm.LLM(
"meta-llama/Llama-2-13b-chat-hf",
"meta-llama/Llama-3.2-1B-Instruct",
enable_lora=True,
max_num_seqs=16,
max_loras=2,
@@ -136,7 +136,7 @@ def test_rotary_emb_replaced(dist_init):
"""Verify rotary emb in all the layers are replaced"""
from vllm.engine.arg_utils import EngineArgs
from vllm.worker.model_runner import ModelRunner
-engine_args = EngineArgs("meta-llama/Llama-2-7b-hf",
+engine_args = EngineArgs("meta-llama/Llama-3.2-1B-Instruct",
long_lora_scaling_factors=(4.0, ),
enable_lora=True)
engine_config = engine_args.create_engine_config()
12 changes: 6 additions & 6 deletions tests/lora/test_lora_manager.py
@@ -415,15 +415,15 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):


@pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
+def test_lru_cache_worker_adapter_manager(llama_3p2_1b_model_extra_embeddings,
sql_lora_files, device):
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_adapter_manager = LRUCacheWorkerLoRAManager(
-4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+4, 2, llama_3p2_1b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, device,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(
-llama_2_7b_model_extra_embeddings)
+llama_3p2_1b_model_extra_embeddings)

mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
@@ -494,16 +494,16 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,


@pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
+def test_worker_adapter_manager(llama_3p2_1b_model_extra_embeddings,
sql_lora_files, device):
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_adapter_manager = WorkerLoRAManager(
-4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
+4, 2, llama_3p2_1b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, device,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(
-llama_2_7b_model_extra_embeddings)
+llama_3p2_1b_model_extra_embeddings)

mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
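Context for the renamed fixtures: these tests drive the worker-side LoRA managers against the 1B base model. A minimal sketch of serving a LoRA adapter on the same base model through the public API; the adapter name and path are placeholders:

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enable_lora=True,
    max_loras=4,
    max_lora_rank=8,
)

# Hypothetical local adapter checkpoint; substitute a real LoRA directory.
sql_lora = LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter")

outputs = llm.generate(
    ["Write a SQL query that counts users per country."],
    SamplingParams(temperature=0.0, max_tokens=64),
    lora_request=sql_lora,
)
print(outputs[0].outputs[0].text)
```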
4 changes: 2 additions & 2 deletions tests/lora/test_worker.py
@@ -17,9 +17,9 @@
def test_worker_apply_lora(sql_lora_files):
vllm_config = VllmConfig(
model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-1B-Instruct",
task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer="meta-llama/Llama-3.2-1B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
8 changes: 4 additions & 4 deletions tests/models/decoder_only/language/test_fp8.py
@@ -26,12 +26,12 @@
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
-# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
-# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-"meta-llama/Llama-2-7b-chat-hf")
+# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+"meta-llama/Llama-3.2-1B-Instruct")
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
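The parametrization above pairs a KV-cache dtype with a base checkpoint and a reference checkpoint. A minimal sketch of the BF16-weights / `fp8_e5m2` KV-cache case outside pytest, assuming `kv_cache_dtype` keeps its current spelling; the prompt is illustrative:

```python
from vllm import LLM, SamplingParams

# BF16 weights with an fp8_e5m2 KV cache; e5m2 needs no calibration scales.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    kv_cache_dtype="fp8_e5m2",
)

outputs = llm.generate(
    ["The theory of relativity states that"],
    SamplingParams(temperature=0.0, max_tokens=4),
)
print(outputs[0].outputs[0].text)
```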
2 changes: 1 addition & 1 deletion tests/models/registry.py
@@ -139,7 +139,7 @@ def check_available_online(
trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
8 changes: 4 additions & 4 deletions tests/quantization/test_cpu_offload.py
@@ -1,5 +1,5 @@
-# SPDX-License-Identifier: Apache-2.0
-
+# SPDX-License-Identifier: Apache-2.0
+# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py

@@ -14,9 +14,9 @@
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct",
["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "2"],
["--quantization", "fp8", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
2 changes: 1 addition & 1 deletion tests/quantization/test_register_quantization_config.py
@@ -99,7 +99,7 @@ def test_register_quantization_config():

@pytest.mark.parametrize(argnames="model",
argvalues=[
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
])
def test_custom_quant(vllm_runner, model):
"""Test infer with the custom quantization method."""
2 changes: 1 addition & 1 deletion tests/samplers/test_ignore_eos.py
@@ -10,7 +10,7 @@

# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]


@pytest.mark.parametrize("model", MODELS)
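The comment in this file refers to a past regression in which EOS tokens declared in a model's `generation_config` ended generation even with `ignore_eos` set. A minimal sketch of the behaviour the test pins down, assuming greedy sampling; the prompt is illustrative:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")

# With ignore_eos=True the engine keeps generating past any EOS token, so the
# completion should contain exactly max_tokens tokens.
params = SamplingParams(temperature=0.0, max_tokens=64, ignore_eos=True)
outputs = llm.generate(["Say hello and then stop."], params)
assert len(outputs[0].outputs[0].token_ids) == 64
```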
6 changes: 3 additions & 3 deletions tests/spec_decode/e2e/test_compatibility.py
@@ -8,7 +8,7 @@


@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "meta-llama/Llama-2-7b-chat-hf",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@@ -27,8 +27,8 @@
},
{
# Speculative max model len > target max model len should raise.
-# https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
-"speculative_max_model_len": 4096 + 1,
+# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
+"speculative_max_model_len": 131072 + 1,
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
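For context on the updated bound: `speculative_max_model_len` may not exceed the target model's maximum context length (131072 for Llama-3.2-1B-Instruct, per the config linked above), and the test expects engine construction to fail. A rough sketch of that failure mode, assuming the flat speculative-decoding arguments used elsewhere in this file are still accepted:

```python
import pytest

from vllm import LLM

# Requesting a speculative max model len beyond the target model's
# 131072-token context should be rejected while the engine config is validated.
with pytest.raises(ValueError):
    LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        speculative_model="JackFram/llama-68m",
        num_speculative_tokens=5,
        speculative_max_model_len=131072 + 1,
    )
```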