
Commit

Signed-off-by: SumanthRH <[email protected]>
SumanthRH committed Feb 7, 2025
1 parent e8f1e27 commit 327ed8f
Showing 4 changed files with 23 additions and 16 deletions.
2 changes: 1 addition & 1 deletion skythought/skythought_evals/README.md
@@ -75,7 +75,7 @@ python -m skythought_evals.inference_and_check --task numina --model Qwen/QwQ-32
 ### Reproducibility Issues
 
 
-We've noticed that it can be hard to reproduce results in reasoning benchmarks. Beyond the lack of agreed sampling parameters and metrics in the field at the moment, there can be significant differences in results across different evaluation codebases, and even for the same codebase with a different set of dependencies. In bfloat16/ half-precision, numerical error accumulation will change outputs ever so slightly, which can dramatically alter final performance. There are three factors we've noticed that affect results:
+We've noticed that it can be hard to reproduce results in reasoning benchmarks. Beyond the lack of agreed sampling parameters and metrics in the field at the moment, there can be significant differences in results across different evaluation codebases, and even for the same codebase with a different set of dependencies. In half-precision (bfloat16 or float16), numerical error accumulation will change outputs ever so slightly, which can dramatically alter final performance. There are three factors we've noticed that affect results:
 
 - Long context generations: Errors can accumulate so that the output changes at 1k+ tokens, which compound as you keep generating. Since we typically set max tokens to be 16k or 32k tokens, the final solution will change significantly
 - vLLM settings: With vLLM, we’ve also noticed that at half-precision, different batch sizes can affect downstream evaluation results by a few percentage points. Further, different tensor parallelism settings can also change results in half-precision.\
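For context on the vLLM factors called out in the README text above, the relevant knobs can be pinned when constructing the engine. The following is a minimal illustrative sketch, not part of this commit; the model name and parameter values are placeholders, and full float32 inference may be impractical for large models:

# Illustrative sketch: pin the settings the README identifies as sources of
# run-to-run variation (dtype, tensor parallelism, sampling parameters).
from vllm import LLM, SamplingParams

llm = LLM(
    model="your-org/your-model",  # placeholder model name
    dtype="float32",              # avoid bfloat16/float16 error accumulation
    tensor_parallel_size=1,       # TP degree also shifts half-precision outputs
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=16384)
outputs = llm.generate(["Solve: what is 2 + 2?"], sampling_params)
print(outputs[0].outputs[0].text)
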
4 changes: 2 additions & 2 deletions skythought/skythought_evals/inference_and_check.py
@@ -101,8 +101,8 @@ def _parse_response_for_idx(
     response_entry = SingleParsedResponse(content=content)
 
     token_usage_for_response = {
-        "completion_tokens": response.num_completion_tokens[sample_idx],
-        "prompt_tokens": response.num_input_tokens,
+        "completion_tokens": response.num_completion_tokens[sample_idx],
+        "prompt_tokens": response.num_input_tokens,
     }
     return response_entry, token_usage_for_response
 
23 changes: 12 additions & 11 deletions skythought/skythought_evals/util/metrics.py
@@ -1,19 +1,20 @@
-from typing import Dict, Any
 import logging
 import math
 from collections import defaultdict
+from typing import Any, Dict
 
 import numpy as np
-
-def _pass_at_k(n, c, k):
-    """
-    :param n: total number of samples
-    :param c: number of correct samples
-    :param k: k in pass@$k$
-    """
-    if n - c < k:
-        return 1.0
-    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+def _pass_at_k(n, c, k):
+    """
+    :param n: total number of samples
+    :param c: number of correct samples
+    :param k: k in pass@$k$
+    """
+    if n - c < k:
+        return 1.0
+    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
 
 def pass_at_k(N: int, temp_to_scores: Dict[str, Dict[str, Any]]):
     # pass at k per temperature
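As background on the `_pass_at_k` helper moved above: it is the standard unbiased pass@k estimator, 1 - C(n-c, k) / C(n, k), computed in a numerically stable product form. A quick sanity check (illustrative only, not part of this commit):

import numpy as np
from math import comb

def _pass_at_k(n, c, k):
    # Probability that at least one of k samples drawn without replacement
    # from n total lands among the c correct ones.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# n=10 samples, c=3 correct, k=2: 1 - C(7,2)/C(10,2) = 1 - 21/45 ≈ 0.5333
print(_pass_at_k(10, 3, 2))          # ~0.5333
print(1 - comb(7, 2) / comb(10, 2))  # same value via binomial coefficients
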
10 changes: 8 additions & 2 deletions skythought/skythought_evals/util/response.py
@@ -50,8 +50,8 @@ def from_openai_response(cls, response) -> "Response":
             Responses: New instance initialized with OpenAI response data
 
         return cls(
-            response=[response.choices[i].message.content for i in range(len(response.choices))],
-            num_completion_tokens=[response.usage.completion_tokens if i == 0 else 0 for i in range(len(response.choices))],
+            response=[
+                response.choices[i].message.content
+                for i in range(len(response.choices))
+            ],
+            num_completion_tokens=[
+                response.usage.completion_tokens if i == 0 else 0
+                for i in range(len(response.choices))
+            ],
             num_input_tokens=response.usage.prompt_tokens,
         )

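A usage sketch for the reformatted `from_openai_response` classmethod above, assuming an OpenAI-style chat completion object with `choices` and `usage` fields; the client call, model name, and import path are illustrative assumptions, not part of this commit:

from openai import OpenAI
from skythought_evals.util.response import Response  # assumed import path

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
completion = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder model
    messages=[{"role": "user", "content": "What is 7 * 8?"}],
    n=2,  # multiple choices; usage tokens are attributed to the first one per the diff
)

resp = Response.from_openai_response(completion)
print(resp.response)               # one message content per choice
print(resp.num_completion_tokens)  # [usage.completion_tokens, 0]
print(resp.num_input_tokens)       # usage.prompt_tokens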

