Merge branch 'EleutherAI:main' into main
eldarkurtic authored Nov 27, 2024
2 parents b0669a8 + 0ef7548 commit e2243bb
Showing 240 changed files with 5,596 additions and 317 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/new_tasks.yml
@@ -16,7 +16,7 @@ jobs:
name: Scan for changed tasks
steps:
- name: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.

@@ -47,7 +47,7 @@ jobs:
- name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
with:
python-version: 3.9
cache: 'pip'
8 changes: 4 additions & 4 deletions .github/workflows/publish.yml
@@ -13,7 +13,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
with:
python-version: "3.x"

@@ -26,7 +26,7 @@
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
@@ -46,7 +46,7 @@

steps:
- name: Download all the dists
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
@@ -68,7 +68,7 @@

steps:
- name: Download all the dists
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
10 changes: 2 additions & 8 deletions .github/workflows/unit_tests.yml
@@ -63,9 +63,9 @@ jobs:
- name: Test with pytest
run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
- name: Archive artifacts
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
-name: output_results
+name: output_testcpu${{ matrix.python-version }}
path: |
test_logs/*
testmodels:
@@ -87,9 +87,3 @@ jobs:
pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test with pytest
run: python -m pytest tests/models --showlocals -s -vv
-- name: Archive artifacts
-uses: actions/upload-artifact@v3
-with:
-name: output_results
-path: |
-test_logs/*
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@ build
dist
*.egg-info
venv
+.venv/
.vscode/
temp
__pycache__
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@
exclude: ^tests/testdata/
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
-rev: v4.6.0
+rev: v5.0.0
hooks:
- id: check-added-large-files
- id: check-ast
@@ -29,7 +29,7 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.6.8
+rev: v0.7.4
hooks:
# Run the linter.
- id: ruff
2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -1 +1 @@
-* @haileyschoelkopf @lintangsutawika @baberabb
+* @baberabb @lintangsutawika
12 changes: 10 additions & 2 deletions README.md
@@ -39,7 +39,7 @@ This project provides a unified framework to test generative language models on

**Features:**
- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
-- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
+- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
@@ -319,8 +319,16 @@ lm_eval --model hf \
--tasks hellaswag
```
-[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:
+GPTQ quantized models can be loaded using [GPTQModel](https://github.com/ModelCloud/GPTQModel) (faster) or [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)
+GPTQModel: add `,gptqmodel=True` to `model_args`
+```bash
+lm_eval --model hf \
+    --model_args pretrained=model-name-or-path,gptqmodel=True \
+    --tasks hellaswag
+```
+AutoGPTQ: add `,autogptq=True` to `model_args`:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
4 changes: 4 additions & 0 deletions docs/API_guide.md
@@ -91,6 +91,10 @@ When initializing a `TemplateAPI` instance or a subclass, you can provide several
- Custom token ID to use as a prefix for inputs.
- If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True).

+- `verify_certificate` (bool, optional):
+    - Whether to validate the certificate of the API endpoint (if HTTPS).
+    - Default is True.


Example usage:

26 changes: 26 additions & 0 deletions docs/chat-template-readme.md
@@ -0,0 +1,26 @@
# Chat Template Delimiter Handling Update

## Overview
This change modifies how delimiters are handled when applying chat templates in the request construction process for likelihood and multiple-choice based tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of using the configured delimiter.

## Background
By default, the system uses a target delimiter (typically a single space, " ") between the context and target text when constructing prompts. The full string is constructed as:
```
doc_to_text(doc) + target_delimiter + doc_to_target(doc)
```

While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.

## The Change
- When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace
- This prevents interference between chat template formatting and the default delimiter system
- Particularly important for multiple choice tasks where the template itself handles spacing

## Example
```
# Before (with default delimiter " ")
<user>Question: What color is the sky?\nAnswer:<assistant> blue
# After
<user>Question: What color is the sky?\nAnswer:<assistant>blue
```
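Editor's note: the rule described in this new doc can be sketched in a few lines of Python. This is an illustration, not code from the commit; `context` and `target` stand in for `doc_to_text(doc)` and `doc_to_target(doc)`.

```python
# Illustrative sketch of the delimiter rule described above (not from the commit).
def build_prompt(context: str, target: str, apply_chat_template: bool) -> str:
    # Chat templates manage their own spacing, so the default delimiter is dropped.
    target_delimiter = "" if apply_chat_template else " "
    return context + target_delimiter + target

assert build_prompt("Answer:", "blue", apply_chat_template=False) == "Answer: blue"
assert build_prompt("Answer:", "blue", apply_chat_template=True) == "Answer:blue"
```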
24 changes: 12 additions & 12 deletions docs/new_task_guide.md
@@ -86,20 +86,20 @@ Let's create a python file in the directory where we're writing our YAML file:
```bash
touch lm_eval/tasks/<dataset_name>/utils.py
```
-Now, in `utils.py` we'll write a function to process each split of our dataset:
-
-TODO: Change the example to one that's in the tasks/
+Now, in `utils.py` we'll write a function to process each split of our dataset (the following example is drawn from [the `hellaswag` task](../lm_eval/tasks/hellaswag/utils.py)):

```python
-def process_docs(dataset: datasets.Dataset):
-    def _helper(doc):
-        # modifies the contents of a single
-        # document in our dataset.
-        doc["choices"] = [doc["choice1"], doc["choice2"], doc["wrong_answer"]]
-        doc["gold"] = doc["label"]
-        return doc
-
-    return dataset.map(_helper)  # returns back a datasets.Dataset object
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc):
+        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
+        out_doc = {
+            "query": preprocess(doc["activity_label"] + ": " + ctx),
+            "choices": [preprocess(ending) for ending in doc["endings"]],
+            "gold": int(doc["label"]),
+        }
+        return out_doc
+
+    return dataset.map(_process_doc)
```

Now, in our YAML config file we'll use the `!function` constructor, and tell the config where our imported Python function will come from. At runtime, before doing anything else we will preprocess our dataset according to this function!
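Editor's note: a task YAML that wires in such a function typically does so with a single `!function` line. A minimal sketch follows; only the `process_docs: !function utils.process_docs` line reflects the convention described above, and every other field value is a placeholder, not from this diff.

```yaml
# Hypothetical task config sketch; all names besides the !function line are placeholders.
task: my_new_task
dataset_path: my_org/my_dataset
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
```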
7 changes: 6 additions & 1 deletion lm_eval/api/task.py
@@ -67,7 +67,7 @@ class TaskConfig(dict):
validation_split: Optional[str] = None
test_split: Optional[str] = None
fewshot_split: Optional[str] = (
-None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
)
# formatting / prompting options.
# see docs/advanced_task_guide.md for more info
@@ -449,6 +449,7 @@ def build_all_requests(
doc=doc,
ctx=fewshot_ctx,
metadata=(self.config["task"], doc_id, self.config.repeats),
+apply_chat_template=apply_chat_template,
)

if not isinstance(inst, list):
@@ -1301,6 +1302,8 @@ def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
def construct_requests(
self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]:
+apply_chat_template = kwargs.pop("apply_chat_template", False)
+
aux_arguments = None

if self.OUTPUT_TYPE == "loglikelihood":
@@ -1310,6 +1313,8 @@
elif self.OUTPUT_TYPE == "multiple_choice":
choices = self.doc_to_choice(doc)
target_delimiter = self.config.target_delimiter
+if apply_chat_template:
+    target_delimiter = ""
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
9 changes: 8 additions & 1 deletion lm_eval/evaluator.py
@@ -294,7 +294,9 @@ def _adjust_config(task_dict):
model_source=model,
model_args=model_args,
system_instruction=system_instruction,
-chat_template=lm.chat_template(apply_chat_template),
+chat_template=lm.chat_template(apply_chat_template)
+if apply_chat_template
+else None,
fewshot_as_multiturn=fewshot_as_multiturn,
)

@@ -400,6 +402,11 @@ def evaluate(

eval_logger.setLevel(getattr(logging, f"{verbosity}"))

+if apply_chat_template:
+    eval_logger.warning(
+        "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
+    )

# tracks all Instances/requests a model must generate output on.
requests = defaultdict(list)
# stores the amount to pad out reqs per req. type so that
5 changes: 2 additions & 3 deletions lm_eval/loggers/wandb_logger.py
@@ -15,10 +15,9 @@

def get_wandb_printer() -> Literal["Printer"]:
"""Returns a wandb printer instance for pretty stdout."""
-from wandb.sdk.lib.printer import get_printer
-from wandb.sdk.wandb_settings import Settings
+from wandb.sdk.lib.printer import new_printer

-printer = get_printer(Settings()._jupyter)
+printer = new_printer()
return printer


1 change: 1 addition & 0 deletions lm_eval/models/__init__.py
@@ -5,6 +5,7 @@
gguf,
hf_vlms,
huggingface,
+ibm_watsonx_ai,
mamba_lm,
nemo_lm,
neuralmagic,
16 changes: 8 additions & 8 deletions lm_eval/models/anthropic_llms.py
@@ -45,8 +45,8 @@ def anthropic_completion(

try:
import anthropic
-except ModuleNotFoundError:
-    raise Exception(
+except ModuleNotFoundError as exception:
+    raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
@@ -108,8 +108,8 @@ def anthropic_chat(

try:
import anthropic
-except ModuleNotFoundError:
-    raise Exception(
+except ModuleNotFoundError as exception:
+    raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
@@ -168,8 +168,8 @@ def __init__(

try:
import anthropic
-except ModuleNotFoundError:
-    raise Exception(
+except ModuleNotFoundError as exception:
+    raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
@@ -217,8 +217,8 @@ def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
try:
import anthropic
-except ModuleNotFoundError:
-    raise Exception(
+except ModuleNotFoundError as exception:
+    raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
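Editor's note: the repeated change in this file swaps a generic `Exception` for a re-raise that preserves the original exception type. A standalone sketch of the pattern follows; the `from exception` chaining is an addition for clearer tracebacks and is not in the diff.

```python
# Re-raise the original exception type (ModuleNotFoundError) with an install hint.
try:
    import anthropic  # noqa: F401
except ModuleNotFoundError as exception:
    raise type(exception)(
        "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. "
        "please install anthropic via `pip install 'lm-eval[anthropic]'`"
    ) from exception
```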