diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py index a9e100be..9aa2e3ab 100644 --- a/tests/test_packing_speed.py +++ b/tests/test_packing_speed.py @@ -15,11 +15,11 @@ # -- do not touch import os - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import time # noqa: E402 +import threadpoolctl # noqa: E402 import unittest # noqa: E402 from parameterized import parameterized # noqa: E402 @@ -75,14 +75,16 @@ def reshape(w): class TestRepacking(unittest.TestCase): - k = 2048 - n = 1024 * 100 group_size = 128 + k = 7168 + n = 7168 zeros = torch.full((k // group_size, n), 8, dtype=torch.int32) print(f"k={k}, n={n}, shape={zeros.shape}, size={zeros.shape[0] * zeros.shape[1] * 4 / 1024 / 1024}M") + print(f"gen_quant: start") _, linear, s = gen_quant4(k, n, group_size) + print(f"gen_quant: start...end") def pack(self, qlinearCls): qlinear = qlinearCls( @@ -90,9 +92,11 @@ def pack(self, qlinearCls): group_size=self.group_size, sym=True, desc_act=True, - inf_eatures=self.k, + in_features=self.k, out_features=self.n, - bias=False) + pack_dtype=torch.int32, + bias=False, + ) qlinear.pack(self.linear, self.s.T, self.zeros.T, g_idx=None) @@ -100,17 +104,36 @@ def pack(self, qlinearCls): @parameterized.expand( [ - [ExllamaQuantLinear, 26.5349], - [TritonV2QuantLinear, 26.5268], - [TorchQuantLinear, 27.0297], + # [ExllamaQuantLinear, 9.63], # A100 Z3: 36.89 # 4090? 26.5349 + # [TritonV2QuantLinear, 9.67], # A100 Z3: 35.04 # 4090? 26.5268 + [TorchQuantLinear, 13.819], # A100 Z3 33.56 # 4090? 27.0297 ] ) def test_pack_speed(self, qlinearCls, expect_time): - now = time.time() - for i in range(30): - self.pack(qlinearCls) - time_usage = time.time() - now - speed = self.k * self.k / time_usage - print(f"{qlinearCls.__name__}, time={time_usage}, speed={speed:.4f}") - - self.assertLess(abs(time_usage - expect_time) / expect_time, 0.025) + with threadpoolctl.threadpool_limits(limits=1): + now = time.time() + for i in range(30): + self.pack(qlinearCls) + time_usage = time.time() - now + speed = self.k * self.k / time_usage + print(f"{qlinearCls.__name__}, time={time_usage}, speed={speed:.4f}") + + self.assertLess(abs(time_usage - expect_time) / expect_time, 0.025, msg=f"time: {time_usage}") + + @parameterized.expand( + [ + # [ExllamaQuantLinear, 9.63], # A100 Z3: 36.89 # 4090? 26.5349 + # [TritonV2QuantLinear, 9.67], # A100 Z3: 35.04 # 4090? 26.5268 + [TorchQuantLinear, 10.674], # A100 Z3 33.56 # 4090? 27.0297 + ] + ) + def test_pack_speed_2_threads(self, qlinearCls, expect_time): + with threadpoolctl.threadpool_limits(limits=2): + now = time.time() + for i in range(30): + self.pack(qlinearCls) + time_usage = time.time() - now + speed = self.k * self.k / time_usage + print(f"{qlinearCls.__name__}, time={time_usage}, speed={speed:.4f}") + + self.assertLess(abs(time_usage - expect_time) / expect_time, 0.025, msg=f"time: {time_usage}")