Skip to content

Commit

Permalink
fix test_packing_speed (#1202)
Browse files Browse the repository at this point in the history
  • Loading branch information
Qubitium authored Feb 2, 2025
1 parent 5f221f3 commit 33f0991
Showing 1 changed file with 39 additions and 16 deletions.
55 changes: 39 additions & 16 deletions tests/test_packing_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@

# -- do not touch
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# -- end do not touch

import time # noqa: E402
import threadpoolctl # noqa: E402
import unittest # noqa: E402

from parameterized import parameterized # noqa: E402
Expand Down Expand Up @@ -75,42 +75,65 @@ def reshape(w):


class TestRepacking(unittest.TestCase):
    """Wall-clock regression test for ``QuantLinear.pack`` on a 4-bit layer.

    The fixture tensors (``zeros``, ``linear``, ``s``) are built once, at class
    definition time, and shared by every test. Each test packs the same layer
    30 times under a fixed native thread-pool limit and requires the total
    elapsed time to be within 2.5% of a recorded reference value.

    NOTE(review): the reference timings are hardware specific (the per-row
    comments mention A100 and 4090 figures) -- confirm them for the CI machine
    before relying on a failure here.
    """

    # Shape of the quantized layer: ``k`` input features, ``n`` output features.
    group_size = 128
    k = 7168
    n = 7168

    # One zero-point per (group, output feature), all set to 8.
    zeros = torch.full((k // group_size, n), 8, dtype=torch.int32)
    print(f"k={k}, n={n}, shape={zeros.shape}, size={zeros.shape[0] * zeros.shape[1] * 4 / 1024 / 1024}M")

    print("gen_quant: start")
    _, linear, s = gen_quant4(k, n, group_size)
    print("gen_quant: start...end")

    def pack(self, qlinearCls):
        """Instantiate ``qlinearCls`` for the shared layer and pack it.

        Args:
            qlinearCls: QuantLinear kernel class to construct.

        Returns:
            The constructed module after ``pack`` has been called on it.
        """
        qlinear = qlinearCls(
            bits=4,
            group_size=self.group_size,
            sym=True,
            desc_act=True,
            in_features=self.k,
            out_features=self.n,
            pack_dtype=torch.int32,
            bias=False,
        )

        # Scales and zero-points are passed transposed relative to how the
        # class-level fixtures store them.
        qlinear.pack(self.linear, self.s.T, self.zeros.T, g_idx=None)

        return qlinear

    def _assert_pack_time(self, qlinearCls, expect_time, threads):
        """Time 30 ``pack`` calls under a thread limit and compare to a reference.

        Args:
            qlinearCls: QuantLinear kernel class to benchmark.
            expect_time: Reference total time, in seconds, for the 30 calls.
            threads: Value passed to ``threadpoolctl.threadpool_limits``.

        Raises:
            AssertionError: If the measured time deviates from ``expect_time``
                by 2.5% or more.
        """
        with threadpoolctl.threadpool_limits(limits=threads):
            # perf_counter is monotonic, so the interval cannot be skewed by
            # system clock adjustments during the run.
            start = time.perf_counter()
            for _ in range(30):
                self.pack(qlinearCls)
            time_usage = time.perf_counter() - start

        speed = self.k * self.k / time_usage
        print(f"{qlinearCls.__name__}, time={time_usage}, speed={speed:.4f}")

        self.assertLess(abs(time_usage - expect_time) / expect_time, 0.025, msg=f"time: {time_usage}")

    @parameterized.expand(
        [
            # [ExllamaQuantLinear, 9.63], # A100 Z3: 36.89 # 4090? 26.5349
            # [TritonV2QuantLinear, 9.67], # A100 Z3: 35.04 # 4090? 26.5268
            [TorchQuantLinear, 13.819],  # A100 Z3 33.56 # 4090? 27.0297
        ]
    )
    def test_pack_speed(self, qlinearCls, expect_time):
        """Packing speed with the native thread pools limited to one thread."""
        self._assert_pack_time(qlinearCls, expect_time, threads=1)

    @parameterized.expand(
        [
            # [ExllamaQuantLinear, 9.63], # A100 Z3: 36.89 # 4090? 26.5349
            # [TritonV2QuantLinear, 9.67], # A100 Z3: 35.04 # 4090? 26.5268
            [TorchQuantLinear, 10.674],  # A100 Z3 33.56 # 4090? 27.0297
        ]
    )
    def test_pack_speed_2_threads(self, qlinearCls, expect_time):
        """Packing speed with the native thread pools limited to two threads."""
        self._assert_pack_time(qlinearCls, expect_time, threads=2)

0 comments on commit 33f0991

Please sign in to comment.