diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index ecde8fbaa15b8..b7bc840508169 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -23,6 +23,9 @@ from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.lora.v1.ops.triton_ops.v1_expand import v1_expand +from vllm.lora.v1.ops.triton_ops.v1_shrink import v1_shrink +from vllm.lora.v1.punica_wrapper.punica_gpu_v1 import V1KernelMeta from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -172,6 +175,8 @@ class OpType(Enum): SGMV_EXPAND = auto() BGMV_EXPAND = auto() BGMV_EXPAND_SLICE = auto() + V1_SHRINK = auto() + V1_EXPAND = auto() @staticmethod def from_str(s: str) -> "OpType": @@ -185,28 +190,43 @@ def from_str(s: str) -> "OpType": return OpType.BGMV_EXPAND if s.lower() == "bgmv_expand_slice": return OpType.BGMV_EXPAND_SLICE + if s.lower() == "v1_shrink": + return OpType.V1_SHRINK + if s.lower() == "v1_expand": + return OpType.V1_EXPAND raise ValueError(f"Unrecognized str {s} to convert to OpType") def is_shrink_fn(self) -> bool: - return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK] + return self in [ + OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK + ] def is_expand_fn(self) -> bool: - return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND] + return self in [ + OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND + ] def is_prefill_op(self) -> bool: - return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND] + return self in [ + OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK, + OpType.V1_EXPAND + ] def is_decode_op(self) -> bool: return self in [ - OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE + OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE, + OpType.V1_SHRINK, OpType.V1_EXPAND 
] def is_expand_slice_fn(self) -> bool: return self in [OpType.BGMV_EXPAND_SLICE] def num_slices(self) -> List[int]: - if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: - # SGMV kernels supports slices + if self in [ + OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK, + OpType.V1_EXPAND + ]: + # SGMV kernels and v1 kernels support slices return [1, 2, 3] if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]: return [1] @@ -251,11 +271,13 @@ def matmul_shapes( m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) b_shape = (num_loras, n, k) # col-major - if self == OpType.SGMV_SHRINK: - # SGMV shrink supports num_slices inherently in the kernel + if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]: + # SGMV shrink and V1 shrink kernels support num_slices inherently + # in the kernel. return ((m, k), b_shape, (num_slices, m, n)) - if self == OpType.SGMV_EXPAND: - # SGMV expand supports num_slices inherently in the kernel + if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]: + # SGMV expand and V1 expand kernels support num_slices inherently + # in the kernel return ((num_slices, m, k), b_shape, (m, n * num_slices)) if self == OpType.BGMV_SHRINK: return ((m, k), b_shape, (m, n)) @@ -282,25 +304,30 @@ def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): return bgmv_expand if self == OpType.BGMV_EXPAND_SLICE: return emulate_bgmv_expand_slice + if self == OpType.V1_SHRINK: + return v1_shrink + if self == OpType.V1_EXPAND: + return v1_expand + raise ValueError(f"Unrecognized optype {self}") def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, lora_weights: List[torch.Tensor], **kwargs) -> Callable: - """Each benchmark operation expected the input, lora_weights and outputs + """Each benchmark operation expects the input, lora_weights and outputs in a slightly different format. Refer to self.matmul_shapes(). run_ref_group_gemm accounts for those differences in executing a reference group gemm for correctness testing. 
""" w_dtype = lora_weights[0].dtype num_slices = len(lora_weights) - if self == OpType.SGMV_SHRINK: + if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]: for slice_idx in range(num_slices): ref_group_gemm(ref_out=output[slice_idx, :], input=input, lora_weights=lora_weights[slice_idx], **kwargs) - if self == OpType.SGMV_EXPAND: + elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]: hidden_size = lora_weights[0].shape[1] for slice_idx in range(num_slices): slice_offset = slice_idx * hidden_size @@ -309,19 +336,19 @@ def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, input=input[slice_idx].clone().to(dtype=w_dtype), lora_weights=lora_weights[slice_idx], **kwargs) - if self == OpType.BGMV_SHRINK: + elif self == OpType.BGMV_SHRINK: assert num_slices == 1 ref_group_gemm(ref_out=output, input=input, lora_weights=lora_weights[0], **kwargs) - if self == OpType.BGMV_EXPAND: + elif self == OpType.BGMV_EXPAND: assert num_slices == 1 ref_group_gemm(ref_out=output, input=input.clone().to(dtype=w_dtype), lora_weights=lora_weights[0], **kwargs) - if self == OpType.BGMV_EXPAND_SLICE: + elif self == OpType.BGMV_EXPAND_SLICE: hidden_size = lora_weights[0].shape[1] for slice_idx in range(num_slices): slice_offset = slice_idx * hidden_size @@ -330,7 +357,8 @@ def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, input=input[slice_idx].clone().to(dtype=w_dtype), lora_weights=lora_weights[slice_idx], **kwargs) - raise ValueError(f"Unrecognized optype {self}") + else: + raise ValueError(f"Unrecognized optype {self}") @dataclass @@ -391,6 +419,8 @@ class BenchmarkTensors: seq_start_loc: torch.Tensor prompt_lora_mapping: torch.Tensor token_lora_mapping: torch.Tensor + # v1 kernel metadata + v1_kernel_meta: Optional[V1KernelMeta] = None def io_types(self) -> str: return (f"{dtype_to_str(self.input.dtype)}x" @@ -433,10 +463,19 @@ def make(ctx: BenchmarkContext, total_tokens, ctx.batch_size, prompt_lora_indices_tensor, seq_len_tensor, "cpu") + 
v1_kernel_meta = None + if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]: + v1_kernel_meta = V1KernelMeta.make( + max_loras=ctx.num_loras, + max_num_tokens=token_lora_indices_tensor.size(0), + device="cpu") + v1_kernel_meta.prepare_tensors( + token_lora_mapping=token_lora_indices_tensor) + return BenchmarkTensors(input_tensor, lora_weights, output_tensor, seq_len_tensor, seq_start_loc_tensor, prompt_lora_indices_tensor, - token_lora_indices_tensor) + token_lora_indices_tensor, v1_kernel_meta) def sanity_check(self) -> None: """ @@ -469,6 +508,13 @@ def to_device(tensor: torch.Tensor): for i in range(len(self.lora_weights_lst)): self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + # v1 meta + if self.v1_kernel_meta: + for field_name in V1KernelMeta.__dataclass_fields__: + field = getattr(self.v1_kernel_meta, field_name) + assert isinstance(field, torch.Tensor) + setattr(self.v1_kernel_meta, field_name, to_device(field)) + def metadata(self) -> Tuple[int, int, int]: """ Return num_seqs, num_tokens and max_seq_len @@ -668,6 +714,78 @@ def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: }) return {'kwargs_list': kwargs_list} + def as_v1_shrink_kwargs(self) -> Dict[str, Any]: + assert self.v1_kernel_meta is not None + self.sanity_check() + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata() + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping, + 'token_indices_sorted_by_lora_ids': + self.v1_kernel_meta.token_indices_sorted_by_lora_ids, + 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora, + 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc, + 'lora_ids': self.v1_kernel_meta.active_lora_ids, + 'scaling': 1.0, + } + + def as_v1_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + assert self.v1_kernel_meta is not None + self.sanity_check() + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata() + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping, + 'token_indices_sorted_by_lora_ids': + self.v1_kernel_meta.token_indices_sorted_by_lora_ids, + 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora, + 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc, + 'lora_ids': self.v1_kernel_meta.active_lora_ids, + 'offset_start': 0, + 'add_inputs': add_inputs, + } + def bench_fn_kwargs(self, op_type: OpType, add_inputs: Optional[bool] = None) -> Dict[str, Any]: @@ -686,6 +804,10 @@ def bench_fn_kwargs(self, return self.as_bgmv_expand_kwargs(add_inputs) if op_type == OpType.BGMV_EXPAND_SLICE: return self.as_bgmv_expand_slice_kwargs(add_inputs) + if op_type == OpType.V1_SHRINK: + return self.as_v1_shrink_kwargs() + if op_type == OpType.V1_EXPAND: + return self.as_v1_expand_kwargs(add_inputs) raise ValueError(f"Unrecognized optype {self}") def test_correctness(self, op_type: OpType, diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 032e20470bcd3..03cafdaec111d 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -6,10 +6,12 @@ import torch import vllm.lora.ops.triton_ops # noqa: F401 +import vllm.lora.v1.ops.triton_ops # noqa: F401 from vllm.lora.ops.torch_ops import (bgmv_expand, 
bgmv_expand_slice, bgmv_shrink, sgmv_expand, sgmv_expand_slice, sgmv_shrink) from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.lora.v1.punica_wrapper.punica_gpu_v1 import V1KernelMeta from vllm.platforms import current_platform from .utils import (PunicaTensors, assert_close, generate_data, @@ -313,6 +315,111 @@ def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int, assert_close(data.our_out_tensor, data.ref_out_tensor) +def check_v1_shrink(batches: int, num_loras: int, rank: int, hidden_size: int, + nslices: int, dtype: torch.dtype, device: str, + seq_length: int, scaling: float): + """ + Compare vllm.v1_shrink against a reference implementation. + """ + data: PunicaTensors = generate_data_for_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + nslices, + dtype, + "shrink", + device, + ) + max_seq_length, token_nums = data.meta() + + # Setup metadata information for reference sgmv kernels + sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor, + data.prompt_lora_mapping, batches, max_seq_length, + token_nums) + + # Setup metadata information for the kernel. + v1_meta = V1KernelMeta.make(max_loras=num_loras, + max_num_tokens=token_nums, + device='cuda') + v1_meta.reset() + v1_meta.prepare_tensors(data.token_lora_mapping) + + # Preventing cache error pointer. + with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.v1_shrink( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + *v1_meta.meta_args(num_tokens=token_nums), + scaling, + ) + + sgmv_shrink_for_nslices( + nslices, + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + *sgmv_meta_args, + scaling, + ) + + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_v1_expand(batches: int, num_loras: int, rank: int, hidden_size: int, + nslices: int, dtype: torch.dtype, device: str, + seq_length: int, add_inputs: bool): + """ + Compare vllm.v1_expand against a reference implementation. 
+ """ + data: PunicaTensors = generate_data_for_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + nslices, + dtype, + "expand", + device, + ) + max_seq_length, token_nums = data.meta() + + # Setup metadata information for reference sgmv kernels + sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor, + data.prompt_lora_mapping, batches, max_seq_length, + token_nums) + + # Setup metadata information for the kernel. + v1_meta = V1KernelMeta.make(max_loras=num_loras, + max_num_tokens=token_nums, + device='cuda') + v1_meta.reset() + v1_meta.prepare_tensors(data.token_lora_mapping) + + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.v1_expand(data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + *v1_meta.meta_args(num_tokens=token_nums), + offset_start=0, + add_inputs=add_inputs) + + sgmv_expand_for_nslices(nslices, + hidden_size, + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + *sgmv_meta_args, + add_inputs=add_inputs) + + assert_close(data.our_out_tensor, data.ref_out_tensor) + + # Tests # We test the punica kernels along 2 verticals mainly. # 1. 
Variations in hidden_dim size @@ -650,3 +757,82 @@ def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int, dtype=dtype, device=device, add_inputs=True) + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [1, 2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("seq_length", [1, 128]) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_v1_kernels(batches: int, num_loras: int, rank: int, hidden_size: int, + nslices: int, dtype: torch.dtype, device: str, seed: int, + seq_length: int, op_type: str): + + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_v1_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=seq_length, + scaling=0.5) + else: + check_v1_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=seq_length, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [1, 2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("seq_length", [1, 13]) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_v1_kernels_hidden_size(batches: int, 
num_loras: int, rank: int, + hidden_size: int, nslices: int, + dtype: torch.dtype, device: str, seed: int, + seq_length: int, op_type: str): + + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_v1_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=seq_length, + scaling=0.5) + else: + check_v1_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=seq_length, + add_inputs=True) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index ef77fd4b74cec..d7e5e77f94f85 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -324,9 +324,11 @@ def __init__( self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None - self.punica_wrapper = get_punica_wrapper(max_num_batched_tokens, - max_batches=self.max_num_seqs, - device=self.device) + self.punica_wrapper = get_punica_wrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device=self.device, + max_loras=self.lora_config.max_loras) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py new file mode 100644 index 0000000000000..d7b95d76027bb --- /dev/null +++ b/vllm/lora/ops/triton_ops/kernel_utils.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Utilities for Punica kernel construction. 
+""" +import triton +import triton.language as tl + + +@triton.jit +def mm_k(a_ptr, b_ptr, ak_stride, bk_stride, offset_k, K: tl.constexpr, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, CAST_TYPE: tl.constexpr, + b_dtype: tl.constexpr): + """ + Given a_ptr and b_ptr, that identify the rows of A (m x k) and columns of + B (k x n), iterate through the K dimension to compute the partial/complete + matrix block product. + If SPLIT_K == 1, the output m x n product is complete. + If SPLIT_K > 1, the thread block computes partial outputs. The partial + outputs are then atomically summed in the caller code. + Args: + a_ptr: Array of pointers, identifying rows of A + b_ptr: Array of pointers, identifying columns of B + ak_stride: K dimension stride of the A matrix + bk_stride: K dimension stride of the B matrix + K: Length of the K dimension + BLOCK_M: M dimension of the output block m x n + BLOCK_N: N dimension of the output block m x n + BLOCK_K: K dimension atom + EVEN_K: True iff the blocks of A and B can be loaded without any + masking. + SPLIT_K: Parameter signifying parallelism in the K dimension. + CAST_TYPE: if True, cast the values from the A matrix to the B + matrix dtype. 
+ b_dtype: datatype of the B matrix + """ + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + tiled_a = tl.load(a_ptr) + tiled_b = tl.load(b_ptr) + else: + tiled_a = tl.load(a_ptr, + mask=offset_k[None, :] + < K - k * (BLOCK_K * SPLIT_K), + other=0) + tiled_b = tl.load(b_ptr, + mask=offset_k[:, None] + < K - k * (BLOCK_K * SPLIT_K), + other=0) + if CAST_TYPE: + tiled_a = tiled_a.to(b_dtype) + accumulator += tl.dot( + tiled_a, + tiled_b, + ) + a_ptr += BLOCK_K * SPLIT_K * ak_stride + b_ptr += BLOCK_K * SPLIT_K * bk_stride + return accumulator + + +@triton.jit +def do_expand_kernel( + pid_n, + lora_index, + slice_id, + input_ptr, + lora_ptr, + out_ptr, + N, + K, + M_LEN, + ram, # array identifying the rows of Input ptr to operate on + slice_start_loc, + # input ptr strides + input_d0_stride, + input_d1_stride, + input_d2_stride, + # lora ptr strides + ls_d0_ptr, + ls_d1_ptr, + ls_d2_ptr, + # out ptr strides + output_d0_stride, + output_d1_stride, + # constants + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + SAME_STRIDE: tl.constexpr, + SLICE_NUM: tl.constexpr, + EVEN_K: tl.constexpr, + CAST_TYPE: tl.constexpr, + ADD_INPUTS: tl.constexpr, +): + """ + Given an array of integers that identifies the rows of A, ram, + a lora index that identifies which LoRA to use from lora_ptr, lora_index, + a slice_id that identifies the input/output slice, + compute the matrix product and store in the appropriate output location. + + Given that this is an expand kernel, we don't perform any split-K reduction + as the K dimension is assumed to be small. 
+ """ + + # ls_d*_ptr can be either an integer or a pointer + if SAME_STRIDE: + # integer + cur_lora_d0_stride = ls_d0_ptr + cur_lora_d1_stride = ls_d1_ptr + cur_lora_d2_stride = ls_d2_ptr + else: + # pointer + cur_lora_d0_stride = tl.load(ls_d0_ptr + slice_id) + cur_lora_d1_stride = tl.load(ls_d1_ptr + slice_id) + cur_lora_d2_stride = tl.load(ls_d2_ptr + slice_id) + + # Identify the input_ptr and lora_ptr from slice_id. + if SLICE_NUM == 1: + cur_input_ptr = input_ptr + cur_lora_ptr = lora_ptr + else: + cur_input_ptr = input_ptr + slice_id * input_d0_stride + cur_lora_ptr = tl.load(lora_ptr + slice_id).to( + tl.pointer_type(out_ptr.dtype.element_ty)) + + # Identify the column indices of B to process. + offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + + # Identify the row pointers of A and column pointers of B + offset_k = tl.arange(0, BLOCK_K) + a_ptr = (cur_input_ptr + ram[:, None] * input_d1_stride + + offset_k[None, :] * input_d2_stride, ) + b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index + + offset_k[:, None] * cur_lora_d2_stride + + rbn[None, :] * cur_lora_d1_stride) + + # Compute the block matrix product. + SPLIT_K = 1 + accumulator = mm_k(a_ptr, b_ptr, input_d2_stride, cur_lora_d2_stride, + offset_k, K, BLOCK_M, BLOCK_N, BLOCK_K, EVEN_K, SPLIT_K, + CAST_TYPE, cur_lora_ptr.dtype.element_ty) + + tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty) + if SLICE_NUM == 1: + cur_slice_start = slice_start_loc + else: + cur_slice_start = tl.load(slice_start_loc + slice_id) + + # Identify the C output pointers to store the results of the accumulator. 
+ offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + cur_slice_start + offset_cm = tl.arange(0, BLOCK_M) + c_ptr = (out_ptr + ram[:, None] * output_d0_stride + + offset_cn[None, :] * output_d1_stride) + c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] + < (cur_slice_start + N)) + + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + tiled_c += tiled_out + tl.store(c_ptr, tiled_c, mask=c_mask) + + +@triton.jit +def do_shrink_kernel( + pid_n, + pid_sk, + slice_id, + lora_index, + input_ptr, + lora_ptr, + out_ptr, + N, + K, + M_LEN, + ram, + # input strides + input_d0_stride, + input_d1_stride, + # lora strides + lora_d0_stride, + lora_d1_stride, + lora_d2_stride, + # output strides + output_d0_stride, + output_d1_stride, + output_d2_stride, + scaling, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, + SLICE_NUM: tl.constexpr, +): + """ + Given an array of integers that identifies the rows of A, ram, + a lora index that identifies which LoRA to use from lora_ptr, lora_index, + a slice_id that identifies the input/output slice, compute the + matrix product and store in the appropriate output location. + """ + + # Identify the lora_ptr from slice_id. + if SLICE_NUM == 1: + # current lora ptr + cur_lora_ptr = lora_ptr + else: + # current lora ptr + cur_lora_ptr = tl.load(lora_ptr + slice_id).to( + tl.pointer_type(input_ptr.dtype.element_ty)) + + # Identify the column indices of B to process. 
+ offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + + # Identify the row pointers of A and column pointers of B + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + a_ptr = (input_ptr + ram[:, None] * input_d0_stride + + offset_k[None, :] * input_d1_stride) + b_ptr = (cur_lora_ptr + lora_d0_stride * lora_index + + rbn[None, :] * lora_d1_stride + + offset_k[:, None] * lora_d2_stride) + + # Compute partial/complete block matrix product. + accumulator = mm_k(a_ptr, b_ptr, input_d1_stride, lora_d2_stride, offset_k, + K, BLOCK_M, BLOCK_N, BLOCK_K, EVEN_K, SPLIT_K, False, + cur_lora_ptr.dtype.element_ty) + + # Identify the C output pointers to store the results of the accumulator. + offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_cm = tl.arange(0, BLOCK_M) + cur_out_ptr = (out_ptr if SLICE_NUM == 1 else out_ptr + + slice_id * output_d0_stride) + c_ptr = cur_out_ptr + ram[:, None] * output_d1_stride + offset_cn[ + None, :] * output_d2_stride + c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] < N) + + accumulator *= scaling + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask) diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index a8e71cacfe5a2..4eeda9876d84e 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -14,6 +14,7 @@ from vllm.utils import direct_register_custom_op +from .kernel_utils import do_expand_kernel from .utils import _get_lora_b_ptr @@ -63,86 +64,56 @@ def _sgmv_expand_kernel( curr_N = N if SAME_STRIDE else tl.load(output_hs_ptr + slice_id) pid_m = pid // cta_n_num pid_n = pid % cta_n_num - M = tl.load(seq_lens + cur_batch) - if pid_m * BLOCK_M > M: + + M_LEN = tl.load(seq_lens + cur_batch) + if pid_m * BLOCK_M >= M_LEN: return - if pid_n * 
BLOCK_N > curr_N: + if pid_n * BLOCK_N >= curr_N: return lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - cur_seq_start = tl.load(b_seq_start_loc + cur_batch) - offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - offset_k = tl.arange(0, BLOCK_K) - ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % curr_N, BLOCK_N), - BLOCK_N) - # ls_d*_ptr can be either an integer or a pointer - if SAME_STRIDE: - # integer - cur_lora_d0_stride = ls_d0_ptr - cur_lora_d1_stride = ls_d1_ptr - cur_lora_d2_stride = ls_d2_ptr - else: - # pointer - cur_lora_d0_stride = tl.load(ls_d0_ptr + slice_id) - cur_lora_d1_stride = tl.load(ls_d1_ptr + slice_id) - cur_lora_d2_stride = tl.load(ls_d2_ptr + slice_id) - if SLICE_NUM == 1: - cur_input_ptr = input_ptr - cur_lora_ptr = lora_ptr - - else: - cur_input_ptr = input_ptr + slice_id * input_d0_stride - cur_lora_ptr = tl.load(lora_ptr + slice_id).to( - tl.pointer_type(out_ptr.dtype.element_ty)) - - a_ptr = (cur_input_ptr + cur_seq_start * input_d1_stride + - ram[:, None] * input_d1_stride + - offset_k[None, :] * input_d2_stride, ) - b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index + - offset_k[:, None] * cur_lora_d2_stride + - rbn[None, :] * cur_lora_d1_stride) - accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) - for k in range(tl.cdiv(K, BLOCK_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - tiled_a = tl.load(a_ptr, - mask=offset_k[None, :] < K - k * BLOCK_K, - other=0) - tiled_b = tl.load(b_ptr, - mask=offset_k[:, None] < K - k * BLOCK_K, - other=0) - if CAST_TYPE: - tiled_a = tiled_a.to(cur_lora_ptr.dtype.element_ty) - accumulator += tl.dot( - tiled_a, - tiled_b, - ) - a_ptr += BLOCK_K * input_d2_stride - b_ptr += BLOCK_K * cur_lora_d2_stride - - tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty) - if SLICE_NUM == 1: - cur_slice_start = 
slice_start_loc - else: - cur_slice_start = tl.load(slice_start_loc + slice_id) - - offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + cur_slice_start - c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + - offset_cn[None, :] * output_d1_stride) - M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( - offset_cn[None, :] < (cur_slice_start + curr_N)) - if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) - tiled_c += tiled_out - tl.store(c_ptr, tiled_c, mask=c_mask) + M_OFFSET = tl.load(b_seq_start_loc + cur_batch) + + CTA_M_LEN = min(BLOCK_M, M_LEN - (pid_m * BLOCK_M)) + CTA_M_OFFSET = M_OFFSET + (pid_m * BLOCK_M) + offset_m = tl.arange(0, BLOCK_M) + ram = CTA_M_OFFSET + tl.max_contiguous( + tl.multiple_of(offset_m % CTA_M_LEN, BLOCK_M), BLOCK_M) + do_expand_kernel( + pid_n, + lora_index, + slice_id, + input_ptr, + lora_ptr, + out_ptr, + curr_N, + K, + CTA_M_LEN, + ram, # array identifying the rows of Input ptr to operate on + slice_start_loc, + # input ptr strides + input_d0_stride, + input_d1_stride, + input_d2_stride, + # lora ptr strides + ls_d0_ptr, + ls_d1_ptr, + ls_d2_ptr, + # out ptr strides + output_d0_stride, + output_d1_stride, + # constants + BLOCK_M, + BLOCK_N, + BLOCK_K, + SAME_STRIDE, + SLICE_NUM, + EVEN_K, + CAST_TYPE, + ADD_INPUTS, + ) @torch.inference_mode() diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 8b26583c11c14..a74770878b995 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -14,6 +14,7 @@ from vllm.utils import direct_register_custom_op +from .kernel_utils import do_shrink_kernel from .utils import _get_lora_a_ptr @@ -61,68 +62,50 @@ def _sgmv_shrink_kernel( slice_id = pid_mix // SPLIT_K pid_sk = pid_mix % SPLIT_K - M = tl.load(seq_lens + cur_batch) - if pid_m * BLOCK_M > M: + M_LEN = tl.load(seq_lens + cur_batch) 
+ if pid_m * BLOCK_M >= M_LEN: return lora_index = tl.load(lora_indices + cur_batch) if lora_index == -1: return - cur_seq_start = tl.load(b_seq_start_loc + cur_batch) - offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M - offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) - - ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) - # input ptr - a_ptr = (input_ptr + cur_seq_start * input_d0_stride + - ram[:, None] * input_d0_stride + - offset_k[None, :] * input_d1_stride) - - if SLICE_NUM == 1: - # current lora ptr - cur_lora_ptr = lora_ptr - else: - # current lora ptr - cur_lora_ptr = tl.load(lora_ptr + slice_id).to( - tl.pointer_type(input_ptr.dtype.element_ty)) - - b_ptr = (cur_lora_ptr + lora_d0_stride * lora_index + - rbn[None, :] * lora_d1_stride + - offset_k[:, None] * lora_d2_stride) - - accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - tiled_a = tl.load(a_ptr) - tiled_b = tl.load(b_ptr) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - tiled_a = tl.load(a_ptr, - mask=offset_k[None, :] < k_remaining, - other=0.0) - tiled_b = tl.load(b_ptr, - mask=offset_k[:, None] < k_remaining, - other=0.0) - accumulator += tl.dot(tiled_a, tiled_b) - - a_ptr += BLOCK_K * SPLIT_K * input_d1_stride - b_ptr += BLOCK_K * SPLIT_K * lora_d2_stride - offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M - - offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N - cur_out_ptr = (out_ptr if SLICE_NUM == 1 else out_ptr + - slice_id * output_d0_stride) - c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ - None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] - < N) - accumulator *= scaling - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(c_ptr, 
accumulator, mask=c_mask) - else: - tl.atomic_add(c_ptr, accumulator, mask=c_mask) + M_OFFSET = tl.load(b_seq_start_loc + cur_batch) + + CTA_M_LEN = min(BLOCK_M, M_LEN - (pid_m * BLOCK_M)) + CTA_M_OFFSET = M_OFFSET + (pid_m * BLOCK_M) + offset_m = tl.arange(0, BLOCK_M) + ram = CTA_M_OFFSET + tl.max_contiguous( + tl.multiple_of(offset_m % CTA_M_LEN, BLOCK_M), BLOCK_M) + + do_shrink_kernel( + pid_n, + pid_sk, + slice_id, + lora_index, + input_ptr, + lora_ptr, + out_ptr, + N, + K, + CTA_M_LEN, + ram, + # input strides + input_d0_stride, + input_d1_stride, + # lora strides + lora_d0_stride, + lora_d1_stride, + lora_d2_stride, + # output strides + output_d0_stride, + output_d1_stride, + output_d2_stride, + scaling, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + SLICE_NUM) @torch.inference_mode() diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 1a2282ae9accd..abbac52bac080 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -147,7 +147,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, dtype=torch.long, device=device) - # 5 is the number of indicies tensors. + # 5 is the number of indices tensors. 
# base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices,long_lora_indices self.indices_len: List[Optional[int]] = [None] * 5 @@ -168,7 +168,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, self.is_prefill = False self.no_lora = False - def _update_base_metadata( + def update_base_metadata( self, mapping: "LoRAMapping", lora_index_to_id: List[Optional[int]], @@ -329,9 +329,9 @@ def update_metadata( long_lora_context: Optional["LongContextLoRAContext"] = None, **kwargs): - self._update_base_metadata(mapping, lora_index_to_id, max_loras, - vocab_size, extra_vocab_size, - long_lora_context) + self.update_base_metadata(mapping, lora_index_to_id, max_loras, + vocab_size, extra_vocab_size, + long_lora_context) if mapping.is_prefill: # Update metadata required for prefill-related operators. self._update_prefill_metada(self.token_lora_indices) diff --git a/vllm/lora/v1/ops/triton_ops/__init__.py b/vllm/lora/v1/ops/triton_ops/__init__.py new file mode 100644 index 0000000000000..f5ca3aa42ea1d --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm.lora.v1.ops.triton_ops.v1_expand import v1_expand +from vllm.lora.v1.ops.triton_ops.v1_shrink import v1_shrink + +__all__ = [ + "v1_shrink", + "v1_expand", +] \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_EXPAND_FALSE.json b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_EXPAND_FALSE.json new file mode 100644 index 0000000000000..e22680d308620 --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_EXPAND_FALSE.json @@ -0,0 +1,4643 @@ +{ + "1": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 
1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + 
"block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + }, + "15360": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": 
{ + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + 
"block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "2": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 
16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { 
+ "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "3": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 
16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 32, + "block_n": 512, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + 
"block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + } +} \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_EXPAND_TRUE.json b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_EXPAND_TRUE.json new file mode 100644 index 
0000000000000..0e48922e38329 --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_EXPAND_TRUE.json @@ -0,0 +1,4643 @@ +{ + "1": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + 
"block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"32768": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + 
"block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + 
"block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 16, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + 
"block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "2": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 
16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 
256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + 
"block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + 
"block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "3": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + 
"block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, 
+ "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + 
"block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + } +} \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_SHRINK.json b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_SHRINK.json new file mode 100644 index 0000000000000..9a5091996f05c --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_A100_SXM4_80GB_SHRINK.json @@ -0,0 +1,6038 @@ +{ + "1": { + "1": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + 
"block_m": 16, + "block_n": 16, + "block_k": 512, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + 
}, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 
256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + 
"block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "2048": { + 
"16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + 
"16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": 
{ + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + 
"block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "2": { + "1": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, 
+ "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + 
"split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 
16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + 
"block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + 
"block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + 
"block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + 
"block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + 
"block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "3": { + "1": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, 
+ "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + 
"2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + 
"split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + 
"block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, 
+ "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + 
"block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 
64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + 
} + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 
8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + 
"15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + 
"16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + } +} \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_EXPAND_FALSE.json b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_EXPAND_FALSE.json new file mode 100644 index 0000000000000..66644fd794ae8 --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_EXPAND_FALSE.json @@ -0,0 +1,4643 @@ +{ + "1": { + "1": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + 
"block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } 
+ } + }, + "64": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + 
"block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } 
+ }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 512, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "2": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + 
"block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null 
+ } + } + }, + "64": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 
16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } 
+ }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "3": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + 
"block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 32, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 
16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + } +} \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_EXPAND_TRUE.json b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_EXPAND_TRUE.json new file mode 100644 index 0000000000000..453d0bca2ea66 --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_EXPAND_TRUE.json @@ -0,0 +1,4643 @@ +{ + "1": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, 
+ "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "16": 
{ + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, 
+ "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + 
"block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + 
"16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + 
"block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } 
+ }, + "8192": { + "16": { + "2048": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "2": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, 
+ "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 
128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { 
+ "16": { + "2048": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 64, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } 
+ }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + 
"block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + 
"block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "3": { + "1": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 64, + 
"block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 256, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "16": { + "2048": { + "block_m": 16, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + 
"block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "16": { + "2048": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 64, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 64, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": 
{ + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "16": { + "2048": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + } + }, + "128": { + "16": { + "2048": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 64, + "block_n": 32, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 16, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "max_nreg": null + } + } + }, + "3072": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 
16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + 
"block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + 
"23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "max_nreg": null + } + } + }, + "8192": { + "16": { + "2048": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "4096": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "6144": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "8192": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "12288": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "15360": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "16384": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "20480": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "23552": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "28672": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + }, + "32768": { + "block_m": 32, + "block_n": 256, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + } +} \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_SHRINK.json b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_SHRINK.json new file mode 100644 index 0000000000000..529991d331c8d --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/configs/NVIDIA_H100_80GB_HBM3_SHRINK.json @@ -0,0 +1,6038 @@ +{ + "1": { 
+ "1": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 512, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 512, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 512, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 512, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 
64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + 
"block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + 
"16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + 
}, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + 
"split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 
128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + 
"split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } 
+ }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + 
"split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "2": { + "1": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + 
"block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": 
{ + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 64, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + 
"28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + 
"split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": 
null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + 
"split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } 
+ }, + "4096": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + 
"split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null 
+ } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + 
"split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null 
+ } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + }, + "3": { + "1": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + 
"block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "16": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 32, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "32": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 
32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "64": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 64, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "128": { + "2048": { + "16": { + "block_m": 16, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + 
"16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "256": { + "2048": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 32, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 256, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 32, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "512": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + 
"15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "1024": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "2048": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { 
+ "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "3072": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "4096": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": 
{ + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "5120": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "6144": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + 
"23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "7168": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 
8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + }, + "8192": { + "2048": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 32, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "4096": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "6144": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 64, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "8192": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "12288": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "15360": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "16384": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "20480": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "23552": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + 
"28672": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + }, + "32768": { + "16": { + "block_m": 64, + "block_n": 16, + "block_k": 128, + "split_k": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": null + } + } + } + } +} \ No newline at end of file diff --git a/vllm/lora/v1/ops/triton_ops/utils.py b/vllm/lora/v1/ops/triton_ops/utils.py new file mode 100644 index 0000000000000..5b2005362972d --- /dev/null +++ b/vllm/lora/v1/ops/triton_ops/utils.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 + +import functools +import json +import os +from pathlib import Path +from typing import Dict, Optional + +import torch + + +@functools.lru_cache(maxsize=100) +def load_v1_op_config(op_type: str, + add_inputs: Optional[bool]) -> Optional[Dict]: + # TODO (varun) : op_type should be either 'v1_shrink' or 'v1_expand' + gpu_name = torch.cuda.get_device_name() + gpu_name = gpu_name.replace(' ', '_') + gpu_name = gpu_name.replace('-', '_') + + config_fname = None + if op_type == "shrink": + config_fname = f"{gpu_name}_{op_type.upper()}.json" + else: + config_fname = (f"{gpu_name}_" + f"{op_type.upper()}_" + f"{str(add_inputs).upper()}.json") + + config_path = Path( + f'{os.path.dirname(os.path.realpath(__file__))}/configs/{config_fname}' + ) + if not config_path.exists(): + return None + + # TODO (varun) : It feels like there must be a utility to do this !! + # load json + config_data = None + with open(str(config_path)) as f: + config_data = json.load(f) + return config_data + + +# TODO (varun) : Maybe rename !! Merge with the normal path !! 
@functools.lru_cache(maxsize=100)
def get_v1_op_configs(
        op_type: str,
        batch: int,
        hidden_size: int,
        rank: int,
        num_slices: int,
        add_inputs: Optional[bool] = None) -> dict[str, Optional[int]]:
    """Return the Triton launch configuration for a v1 LoRA shrink/expand op.

    A tuned configuration is looked up in the JSON loaded by
    ``load_v1_op_config``; that data is indexed as
    ``config_data[num_slices][m][k][n]``. For ``m``, ``k`` and ``n`` an exact
    key is preferred and the numerically closest key is used otherwise. When
    no tuned file exists, or it has no entry for ``num_slices``, a built-in
    default configuration is returned instead.

    The result is memoized by ``functools.lru_cache``, so the same dict object
    is handed out for repeated calls with equal arguments; callers should
    treat it as read-only.

    Args:
        op_type: Either ``"shrink"`` or ``"expand"``.
        batch: Number of tokens; used as the ``m`` lookup dimension.
        hidden_size: Hidden dimension (``k`` for shrink, ``n`` for expand).
        rank: LoRA rank (``n`` for shrink, ``k`` for expand).
        num_slices: Number of LoRA slices.
        add_inputs: Forwarded to ``load_v1_op_config`` to select the file.

    Returns:
        A dict with the keys ``block_m``, ``block_n``, ``block_k``,
        ``num_warps``, ``num_ctas``, ``num_stages`` and ``max_nreg`` (plus
        ``split_k`` for shrink). ``max_nreg`` may be ``None``.
    """
    assert op_type in ["shrink", "expand"]

    def _select_nearest(table: dict, value: int):
        # Prefer the exact key; otherwise take the numerically closest one.
        exact = table.get(str(value))
        if exact:
            return exact
        nearest_key = min(table.keys(), key=lambda key: abs(int(key) - value))
        return table[nearest_key]

    # default config
    if op_type == "shrink":
        default = {
            'block_m': 32,
            'block_n': 16,
            'block_k': 256 if batch < 128 else 32,
            'split_k': 64 if batch < 128 else 8,
            'num_warps': 4,
            'num_ctas': 1,
            'num_stages': 2,
            'max_nreg': None
        }
    else:
        default = {
            'block_m': 64,
            'block_n': 128,
            'block_k': 16,
            'num_warps': 4,
            'num_ctas': 1,
            'num_stages': 2,
            'max_nreg': None
        }

    m = batch
    k, n = (hidden_size, rank) if op_type == "shrink" else (rank, hidden_size)

    config_data = load_v1_op_config(op_type, add_inputs)
    if not config_data:
        return default

    # config is structured as config_data[num_slices][m][k][n] = {}
    # slice by num_slices; an untuned slice count falls back to the default
    # instead of raising KeyError.
    config_data = config_data.get(str(num_slices))
    if not config_data:
        return default

    # slice by m, then k, then n
    for dim in (m, k, n):
        config_data = _select_nearest(config_data, dim)

    assert config_data is not None
    return config_data
# Triton kernel for the LoRA "expand" step (multiplication by the LoRA B
# weights). Grid layout, as read from the program ids below:
#   axis 0: flattened (M-tile, N-tile) index
#   axis 1: slice index
#   axis 2: index into `lora_ids`
# Each program gathers up to BLOCK_M token rows belonging to one LoRA and
# hands the tile to `do_expand_kernel`.
@triton.jit
def _v1_expand_kernel(
        input_ptr,
        lora_ptr,
        out_ptr,
        M,
        N,
        K,
        token_indices_sorted_by_lora_ids,
        num_tokens_per_lora,
        lora_token_start_loc,
        lora_ids,
        slice_start_loc,
        input_d0_stride,
        input_d1_stride,
        input_d2_stride,  # 1
        ls_d0_ptr,
        ls_d1_ptr,
        ls_d2_ptr,  # 1
        output_d0_stride,
        output_d1_stride,  # 1
        output_hs_ptr,
        BLOCK_M: tl.constexpr,
        BLOCK_N: tl.constexpr,
        BLOCK_K: tl.constexpr,
        EVEN_K: tl.constexpr,
        ADD_INPUTS: tl.constexpr,
        CAST_TYPE: tl.constexpr,
        SLICE_NUM: tl.constexpr,
        SAME_STRIDE: tl.constexpr):

    cta_n_num = tl.cdiv(N, BLOCK_N)
    cta_m_num = tl.cdiv(M, BLOCK_M)

    # Un-flatten grid axis 0 into (pid_m, pid_n).
    pid_mn = tl.program_id(axis=0)
    pid_m = pid_mn % cta_m_num
    pid_n = (pid_mn // cta_m_num) % cta_n_num

    slice_id = tl.program_id(axis=1)
    lora_idx = tl.program_id(axis=2)

    lora_id = tl.load(lora_ids + lora_idx)
    if lora_id == -1:
        # Early exit for the no-lora case.
        return

    lora_m_size = tl.load(num_tokens_per_lora + lora_idx)

    cta_m_offset = pid_m * BLOCK_M
    if cta_m_offset >= lora_m_size:
        # Early exit CTA: this LoRA has no rows left for this M-tile.
        return

    # When the output dimensions of each slice are the same, curr_N = N;
    # otherwise curr_N = tl.load(output_hs_ptr + slice_id). The latter
    # situation exists in GQA's qkv linear.
    curr_N = N if SAME_STRIDE else tl.load(output_hs_ptr + slice_id)
    if pid_n * BLOCK_N >= curr_N:
        # Early exit CTA. Valid columns are [0, curr_N), so a tile whose
        # first column is exactly curr_N has nothing to compute (hence `>=`,
        # not `>`).
        return

    # num rows this CTA should process.
    cta_m_len = min(BLOCK_M, lora_m_size - cta_m_offset)

    # Identify all rows that this CTA should process.
    lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx)
    cta_lora_seq_indices = (token_indices_sorted_by_lora_ids +
                            lora_m_indices_start + cta_m_offset)

    # Load all relevant row indices. The modulo wraps lanes beyond cta_m_len
    # back onto valid rows so the load never reads past this LoRA's region.
    offset_m = tl.arange(0, BLOCK_M) % cta_m_len
    ram = tl.load(cta_lora_seq_indices + offset_m)

    do_expand_kernel(
        pid_n,
        lora_id,
        slice_id,
        input_ptr,
        lora_ptr,
        out_ptr,
        N,
        K,
        cta_m_len,
        ram,  # array identifying the rows of Input ptr to operate on
        slice_start_loc,
        # input ptr strides
        input_d0_stride,
        input_d1_stride,
        input_d2_stride,
        # lora ptr strides
        ls_d0_ptr,
        ls_d1_ptr,
        ls_d2_ptr,
        # out ptr strides
        output_d0_stride,
        output_d1_stride,
        # constants
        BLOCK_M,
        BLOCK_N,
        BLOCK_K,
        SAME_STRIDE,
        SLICE_NUM,
        EVEN_K,
        CAST_TYPE,
        ADD_INPUTS)


@torch.inference_mode()
def _v1_expand(
    inputs: torch.Tensor,  # shape [num_slices, num_tokens, lora_rank]
    lora_b_weights: List[
        torch.Tensor],  # shape [num_lora, hidden_size, lora_rank]
    output_tensor: torch.
    Tensor,  # shape [num_tokens, hidden_size * num_slices]
    token_lora_mapping: torch.Tensor,  # shape [num_tokens]
    token_indices_sorted_by_lora_ids: torch.Tensor,  # shape [num_tokens]
    num_tokens_per_lora: torch.Tensor,  # shape [max-loras + 1]
    lora_token_start_loc: torch.Tensor,  # shape [max-loras + 2]
    lora_ids: torch.Tensor,  # shape [max-loras + 1]
    offset_start: int = 0,
    add_inputs: bool = False,
) -> None:
    """Launch the v1 LoRA expand kernel, writing into ``output_tensor``.

    Args:
        inputs (torch.Tensor): Input tensor.
        lora_b_weights (List[torch.Tensor]): LoRA B weights, one tensor per
            slice.
        output_tensor (torch.Tensor): Output tensor; must be contiguous. It
            is modified in place.
        token_lora_mapping (torch.Tensor): A tensor mapping each input token
            to the lora-id related to that token. A value of -1 indicates that
            LoRA doesn't apply to that token.
        token_indices_sorted_by_lora_ids (torch.Tensor): Row/Token indices from
            the A matrix grouped by LoRA IDs.
        num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the number
            of tokens that are to be processed by LoRA ID lora_ids[i].
        lora_token_start_loc (torch.Tensor): A cumulative sum of
            num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
            lora_token_start_loc[i], along with num_tokens_per_lora[i],
            identifies the region in token_indices_sorted_by_lora_ids that
            LoRA lora_ids[i] should process.
        lora_ids (torch.Tensor): LoRA ids to process.
        offset_start (int, optional): Offset start for output_tensor.
            Defaults to 0.
        add_inputs (bool, optional): Whether to add the input tensor to the
            output tensor. Defaults to False.
    """
    assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
    for weight in lora_b_weights:
        assert weight.dtype in [torch.float16, torch.bfloat16]

    assert inputs.size(0) == len(lora_b_weights)
    assert output_tensor.is_contiguous()

    # metadata sanity check
    assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(
        0)
    assert lora_ids.size(0) == num_tokens_per_lora.size(0)
    assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1

    (slice_start_tensor, lora_ptr_tensor, lora_strides_d0_tensor,
     lora_strides_d1_tensor, lora_strides_d2_tensor, hidden_sizes_tensor,
     same_stride, MAX_N) = _get_lora_b_ptr(lora_b_weights, offset_start,
                                           inputs.device)

    K = lora_b_weights[0].shape[-1]  # K = rank
    M = inputs.size(1)
    ADD_INPUTS = add_inputs
    MAX_LORAS = lora_ids.size(0)
    NUM_SLICES = len(lora_b_weights)

    kernel_config = get_v1_op_configs(op_type="expand",
                                      batch=M,
                                      hidden_size=MAX_N,
                                      rank=K,
                                      num_slices=NUM_SLICES,
                                      add_inputs=add_inputs)
    BLOCK_M = kernel_config['block_m']
    BLOCK_N = kernel_config['block_n']
    BLOCK_K = kernel_config['block_k']
    EVEN_K = K % BLOCK_K == 0

    # The kernel is asked to cast only when float32 inputs meet half-precision
    # LoRA weights.
    CAST_TYPE = (inputs.dtype == torch.float32
                 and lora_b_weights[0].dtype
                 in [torch.float16, torch.bfloat16])

    # TODO (varun): This grid formulation maximizes parallelization at the
    # cost of wasteful thread block launch when only a few input tokens require
    # LoRA. This might not be the best in all cases.
    grid = (
        triton.cdiv(M, BLOCK_M) * triton.cdiv(MAX_N, BLOCK_N),
        NUM_SLICES,
        # Each LoRA receives its own set of thread blocks for output
        # computation. If some LoRA doesn't have any tokens to process, its
        # thread blocks simply exit.
        MAX_LORAS,
    )

    _v1_expand_kernel[grid](
        inputs,
        lora_ptr_tensor,
        output_tensor,
        M,
        MAX_N,
        K,
        token_indices_sorted_by_lora_ids,
        num_tokens_per_lora,
        lora_token_start_loc,
        lora_ids,
        slice_start_tensor,
        inputs.stride(0),
        inputs.stride(1),
        inputs.stride(2),
        lora_strides_d0_tensor,
        lora_strides_d1_tensor,
        lora_strides_d2_tensor,
        output_tensor.stride(0),
        output_tensor.stride(1),
        hidden_sizes_tensor,
        BLOCK_M,
        BLOCK_N,
        BLOCK_K,
        EVEN_K,
        ADD_INPUTS,
        CAST_TYPE,
        NUM_SLICES,
        same_stride,
        num_warps=kernel_config['num_warps'],
        num_ctas=kernel_config['num_ctas'],
        num_stages=kernel_config['num_stages'],
        maxnreg=kernel_config['max_nreg'],
    )
    return


def _v1_expand_fake(
    inputs: torch.Tensor,
    lora_b_weights: List[torch.Tensor],
    output_tensor: torch.Tensor,
    token_lora_mapping: torch.Tensor,
    token_indices_sorted_by_lora_ids: torch.Tensor,
    num_tokens_per_lora: torch.Tensor,
    lora_token_start_loc: torch.Tensor,
    lora_ids: torch.Tensor,
    offset_start: int = 0,
    add_inputs: bool = False,
) -> None:
    """No-op stand-in registered as the ``fake_impl`` of ``v1_expand``.

    It takes the same arguments as ``_v1_expand`` and performs no work.
    """
    return


# Register the op as a custom op; if registration raises AttributeError,
# fall back to calling the Python function directly.
try:
    direct_register_custom_op(
        op_name="v1_expand",
        op_func=_v1_expand,
        mutates_args=["output_tensor"],
        fake_impl=_v1_expand_fake,
    )
    v1_expand = torch.ops.vllm.v1_expand

except AttributeError:
    v1_expand = _v1_expand
@triton.jit
def _v1_shrink_kernel(input_ptr, lora_ptr, out_ptr, M, N, K,
                      token_indices_sorted_by_lora_ids, num_tokens_per_lora,
                      lora_token_start_loc, lora_ids, scaling, input_d0_stride,
                      input_d1_stride, lora_d0_stride, lora_d1_stride,
                      lora_d2_stride, output_d0_stride, output_d1_stride,
                      output_d2_stride, BLOCK_M: tl.constexpr,
                      BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
                      EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr,
                      SLICE_NUM: tl.constexpr):
    """
    Select the rows one program instance must process for one LoRA and hand
    them to `do_shrink_kernel`.

    Launch grid layout (as decoded below):
        axis 0: packed (split-k index, M-block index, N-block index)
        axis 1: slice index
        axis 2: index into `lora_ids` / `num_tokens_per_lora` /
                `lora_token_start_loc`

    A program exits early when its `lora_ids` entry is -1 or when its
    M-block starts at or beyond the number of tokens mapped to that LoRA.
    """

    cta_n_num = tl.cdiv(N, BLOCK_N)
    cta_m_num = tl.cdiv(M, BLOCK_M)

    # Unpack axis 0: split-k varies fastest, then the M-block, then the
    # N-block.
    pid_sk_m_n = tl.program_id(axis=0)
    pid_sk = pid_sk_m_n % SPLIT_K
    pid_m = (pid_sk_m_n // SPLIT_K) % cta_m_num
    pid_n = pid_sk_m_n // (SPLIT_K * cta_m_num) % cta_n_num

    slice_id = tl.program_id(axis=1)
    lora_idx = tl.program_id(axis=2)

    lora_id = tl.load(lora_ids + lora_idx)
    if lora_id == -1:
        # Early exit for the no-lora case.
        return

    # Number of tokens assigned to this LoRA.
    lora_m_size = tl.load(num_tokens_per_lora + lora_idx)

    cta_m_offset = pid_m * BLOCK_M
    if cta_m_offset >= lora_m_size:
        # Early exit CTA.
        return

    # num rows this CTA should process.
    cta_m_len = min(BLOCK_M, lora_m_size - cta_m_offset)

    # Identify all rows that this CTA should process.
    # lora_token_start_loc[lora_idx] is where this LoRA's region begins in
    # token_indices_sorted_by_lora_ids; cta_m_offset moves to this CTA's
    # block inside that region.
    lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx)
    cta_lora_seq_indices = (token_indices_sorted_by_lora_ids +
                            lora_m_indices_start + cta_m_offset)

    # Load all relevant row indices.
    # The modulo wraps lanes at or past cta_m_len back onto valid positions,
    # so every lane reads an index that belongs to this CTA's rows.
    # NOTE(review): presumably do_shrink_kernel uses cta_m_len to ignore the
    # wrapped (duplicate) rows -- confirm in kernel_utils.
    offset_m = tl.arange(0, BLOCK_M) % cta_m_len
    ram = tl.load(cta_lora_seq_indices + offset_m)

    do_shrink_kernel(
        pid_n,
        pid_sk,
        slice_id,
        lora_id,
        input_ptr,
        lora_ptr,
        out_ptr,
        N,
        K,
        cta_m_len,
        ram,  # array identifying the rows of Input ptr to operate on
        # input strides
        input_d0_stride,
        input_d1_stride,
        # lora strides
        lora_d0_stride,
        lora_d1_stride,
        lora_d2_stride,
        # output strides
        output_d0_stride,
        output_d1_stride,
        output_d2_stride,
        scaling,
        BLOCK_M,
        BLOCK_N,
        BLOCK_K,
        EVEN_K,
        SPLIT_K,
        SLICE_NUM)
@torch.inference_mode()
def _v1_shrink(
    inputs: torch.Tensor,  # shape [num_tokens, hidden_size]
    lora_a_weights: List[
        torch.Tensor],  # shape [num_loras, lora_rank, hidden_size]
    output_tensor: torch.Tensor,  # shape [num_slices, num_tokens, lora_rank]
    token_lora_mapping: torch.Tensor,  # shape [num_tokens]
    token_indices_sorted_by_lora_ids: torch.Tensor,  # shape [num_tokens]
    num_tokens_per_lora: torch.Tensor,  # shape [max-loras + 1]
    lora_token_start_loc: torch.Tensor,  # shape [max-loras + 2]
    lora_ids: torch.Tensor,  # shape [max-loras + 1]
    scaling: float,
) -> None:
    """
    Launch the LoRA shrink kernel, writing its result into `output_tensor`.

    Args:
        inputs (torch.Tensor): Input tensor.
        lora_a_weights (List[torch.Tensor]): LoRA A weights, one tensor per
            slice.
        output_tensor (torch.Tensor): Output tensor; must be contiguous.
        token_lora_mapping (torch.Tensor): Maps each input token to the
            lora-id that applies to it. A value of -1 means no LoRA applies
            to that token.
        token_indices_sorted_by_lora_ids (torch.Tensor): Row/token indices
            grouped by LoRA ID.
        num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the
            number of tokens processed by LoRA ID lora_ids[i].
        lora_token_start_loc (torch.Tensor): Cumulative sum of
            num_tokens_per_lora with a leading 0, so that
            lora_token_start_loc[i] together with num_tokens_per_lora[i]
            identifies the region of token_indices_sorted_by_lora_ids that
            LoRA lora_ids[i] should process.
        lora_ids (torch.Tensor): LoRA ids to process.
        scaling (float): Scaling factor.
    """
    half_precision = [torch.float16, torch.bfloat16]

    # dtype checks: inputs and every weight must be fp16 / bf16.
    assert inputs.dtype == lora_a_weights[0].dtype
    assert inputs.dtype in half_precision
    assert all(w.dtype in half_precision for w in lora_a_weights)

    # layout checks
    assert inputs.size(1) == lora_a_weights[0].size(-1)
    assert inputs.is_contiguous()
    assert output_tensor.is_contiguous()

    # metadata sanity check
    num_lora_slots = lora_ids.size(0)
    assert (token_lora_mapping.size(0) ==
            token_indices_sorted_by_lora_ids.size(0))
    assert num_lora_slots == num_tokens_per_lora.size(0)
    assert lora_token_start_loc.size(0) == num_lora_slots + 1

    weight_ptrs, w_stride_d0, w_stride_d1, w_stride_d2 = _get_lora_a_ptr(
        lora_a_weights, inputs.device)

    # Problem sizes: the last two weight dims are (rank, hidden_size).
    rank, hidden_size = lora_a_weights[0].shape[-2:]
    num_tokens = inputs.size(0)
    num_slices = len(lora_a_weights)

    cfg = get_v1_op_configs("shrink",
                            batch=num_tokens,
                            hidden_size=hidden_size,
                            rank=rank,
                            num_slices=num_slices)
    block_m = cfg['block_m']
    block_n = cfg['block_n']
    block_k = cfg['block_k']
    split_k = cfg['split_k']

    even_k = hidden_size % (block_k * split_k) == 0

    # TODO (varun): This grid formulation maximizes parallelization at the
    # cost of wasteful thread block launch when only few of the input tokens
    # require LoRA. This might not be the best in all cases.
    m_blocks = triton.cdiv(num_tokens, block_m)
    n_blocks = triton.cdiv(rank, block_n)
    grid = (
        split_k * m_blocks * n_blocks,
        num_slices,
        # Each LoRA receives its own set of thread blocks for output
        # computation. If some LoRA doesn't have any tokens to process, its
        # thread blocks exit early.
        num_lora_slots,
    )

    _v1_shrink_kernel[grid](
        inputs,
        weight_ptrs,
        output_tensor,
        num_tokens,
        rank,
        hidden_size,
        token_indices_sorted_by_lora_ids,
        num_tokens_per_lora,
        lora_token_start_loc,
        lora_ids,
        scaling,
        inputs.stride(0),
        inputs.stride(1),
        w_stride_d0,
        w_stride_d1,
        w_stride_d2,
        output_tensor.stride(0),
        output_tensor.stride(1),
        output_tensor.stride(2),
        block_m,
        block_n,
        block_k,
        even_k,
        split_k,
        num_slices,
        num_warps=cfg['num_warps'],
        num_ctas=cfg['num_ctas'],
        num_stages=cfg['num_stages'],
        maxnreg=cfg['max_nreg'],
    )


def _v1_shrink_fake(
    inputs: torch.Tensor,
    lora_a_weights: List[torch.Tensor],
    output_tensor: torch.Tensor,
    token_lora_mapping: torch.Tensor,
    token_indices_sorted_by_lora_ids: torch.Tensor,
    num_tokens_per_lora: torch.Tensor,
    lora_token_start_loc: torch.Tensor,
    lora_ids: torch.Tensor,
    scaling: float,
) -> None:
    """Fake implementation registered alongside `_v1_shrink`; does nothing."""
    return None


# Register the op; if registration or the lookup raises AttributeError, fall
# back to calling the Python function directly.
try:
    direct_register_custom_op(
        op_name="v1_shrink",
        op_func=_v1_shrink,
        mutates_args=["output_tensor"],
        fake_impl=_v1_shrink_fake,
    )
    v1_shrink = torch.ops.vllm.v1_shrink

except AttributeError:
    v1_shrink = _v1_shrink
@dataclass
class V1KernelMeta:
    """
    Pre-allocated metadata buffers describing which tokens belong to which
    LoRA.

    The buffers are allocated once by `make` and refilled in place by
    `prepare_tensors`. `meta_args` returns them in the order
    (token_lora_mapping, token_indices_sorted_by_lora_ids,
    num_tokens_per_lora, lora_token_start_loc, active_lora_ids).
    """
    # lora id of every token; only the first `num_tokens` entries are valid.
    token_lora_mapping: torch.Tensor
    # token indices ordered so that tokens of the same lora id are adjacent.
    token_indices_sorted_by_lora_ids: torch.Tensor
    # distinct lora ids of the current batch, padded with -1.
    active_lora_ids: torch.Tensor
    # num_tokens_per_lora[i] is the token count of active_lora_ids[i].
    num_tokens_per_lora: torch.Tensor
    # cumulative sum of num_tokens_per_lora with a leading 0.
    lora_token_start_loc: torch.Tensor

    @staticmethod
    def make(max_loras: int, max_num_tokens: int,
             device: torch.device) -> "V1KernelMeta":
        """
        Allocate int32 buffers sized for `max_loras` LoRAs and
        `max_num_tokens` tokens on `device`.
        """
        token_lora_mapping = torch.empty(max_num_tokens,
                                         dtype=torch.int32,
                                         device=device)

        token_indices_sorted_by_lora_ids = torch.empty(max_num_tokens,
                                                       dtype=torch.int32,
                                                       device=device)

        # +1 because "no-lora" is also a possibility
        # example: let max_loras be 3, active_lora_ids of [-1, 0, 1, 2]
        # is a possibility.
        active_lora_ids = torch.empty(max_loras + 1,
                                      dtype=torch.int32,
                                      device=device)

        # using running example, [3, 10, 5, 2] is a possibility.
        num_tokens_per_lora = torch.zeros(max_loras + 1,
                                          dtype=torch.int32,
                                          device=device)

        # +2 for this because, the first index is always 0
        # for example: let max loras be 3, then lora_token_start_loc,
        # can be [0, 3, 13, 18, 20].
        lora_token_start_loc = torch.zeros(max_loras + 2,
                                           dtype=torch.int32,
                                           device=device)
        return V1KernelMeta(
            token_lora_mapping=token_lora_mapping,
            token_indices_sorted_by_lora_ids=token_indices_sorted_by_lora_ids,
            active_lora_ids=active_lora_ids,
            num_tokens_per_lora=num_tokens_per_lora,
            lora_token_start_loc=lora_token_start_loc)

    def reset(self) -> None:
        """Mark every LoRA slot inactive (-1) and zero counts and offsets."""
        self.active_lora_ids.fill_(-1)
        self.num_tokens_per_lora.fill_(0)
        self.lora_token_start_loc.fill_(0)

    def prepare_tensors(self, token_lora_mapping: torch.Tensor) -> None:
        """
        Refill all buffers from a per-token lora-id mapping.

        Args:
            token_lora_mapping (torch.Tensor): 1-D tensor holding the lora id
                of every token; -1 denotes "no LoRA".
        """
        self.reset()

        num_tokens = token_lora_mapping.size(0)

        # copy token lora mapping
        self.token_lora_mapping[:num_tokens].copy_(token_lora_mapping,
                                                   non_blocking=True)

        # token_indices_sorted_by_lora_ids: a stable sort groups the tokens
        # in ascending lora-id order while keeping their relative order.
        _, token_indices_sorted_by_lora_ids = torch.sort(token_lora_mapping,
                                                         stable=True)
        # start gpu transfer
        self.token_indices_sorted_by_lora_ids[:num_tokens].copy_(
            token_indices_sorted_by_lora_ids, non_blocking=True)

        # active_lora_ids, num_tokens_per_lora
        # sorted=True is required: entry i of these tensors must describe the
        # i-th group of the sorted token indices above, and those groups are
        # in ascending lora-id order. The cumulative offsets below depend on
        # the same ordering.
        lora_ids, num_tokens_per_lora = torch.unique(token_lora_mapping,
                                                     sorted=True,
                                                     return_counts=True)
        self.active_lora_ids[:lora_ids.size(0)].copy_(lora_ids,
                                                      non_blocking=True)
        self.num_tokens_per_lora[:num_tokens_per_lora.size(0)].copy_(
            num_tokens_per_lora, non_blocking=True)

        # lora_token_start_loc: index 0 stays 0, the running totals follow.
        lora_token_start_loc = torch.cumsum(num_tokens_per_lora, dim=0)
        self.lora_token_start_loc[1:1 + lora_token_start_loc.size(0)].copy_(
            lora_token_start_loc, non_blocking=True)

    def meta_args(
        self, num_tokens: int
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        """
        Return the metadata tensors; the two per-token buffers are sliced to
        their first `num_tokens` entries, the per-LoRA buffers are returned
        whole.
        """
        return (self.token_lora_mapping[:num_tokens],
                self.token_indices_sorted_by_lora_ids[:num_tokens],
                self.num_tokens_per_lora, self.lora_token_start_loc,
                self.active_lora_ids)
@final
class PunicaWrapperGPUV1(PunicaWrapperBase):
    """
    Punica wrapper that drives the `v1_shrink` / `v1_expand` ops.

    Two `V1KernelMeta` instances are kept: one built from the per-token LoRA
    indices (used by the linear and embedding paths) and one built from the
    sampler indices (used by the logits path).
    """

    def __init__(self, max_num_batched_tokens: int, max_batches: int,
                 device: Union[torch.device, str], **kwargs):
        """
        Args:
            max_num_batched_tokens (int): Size of the per-token metadata
                buffers.
            max_batches (int): Size of the per-prompt metadata buffers.
            device (Union[torch.device, str]): Device for the buffers.
            **kwargs: Must contain `max_loras`.

        Raises:
            KeyError: If `max_loras` is missing from `kwargs`.
        """
        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
                                   device)
        self.max_loras = kwargs['max_loras']
        self.token_mapping_v1_meta = V1KernelMeta.make(self.max_loras,
                                                       max_num_batched_tokens,
                                                       device=device)
        self.prompt_mapping_v1_meta = V1KernelMeta.make(self.max_loras,
                                                        max_batches,
                                                        device=device)

    def update_metadata(
            self,
            mapping: LoRAMapping,
            lora_index_to_id: List[Optional[int]],
            max_loras: int,
            vocab_size: int,
            extra_vocab_size: int,
            long_lora_context: Optional["LongContextLoRAContext"] = None,
            **kwargs):
        """
        Update the base-class metadata, then rebuild both kernel metadata
        sets from `self.token_lora_indices` and `self.sampler_indices`.
        """
        self.update_base_metadata(mapping, lora_index_to_id, max_loras,
                                  vocab_size, extra_vocab_size,
                                  long_lora_context)
        self.token_mapping_v1_meta.prepare_tensors(self.token_lora_indices)
        self.prompt_mapping_v1_meta.prepare_tensors(self.sampler_indices)

    def _apply_shrink(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        w_t_all: Tuple[torch.Tensor, ...],
        scale: float,
    ):
        """Run `v1_shrink` on a 2-D `x` ([num_tokens, hidden]) into `y`."""
        # TODO (varun): Handle no_lora case - we can skip the op entirely.
        v1_shrink(
            x,
            w_t_all,
            y,
            *self.token_mapping_v1_meta.meta_args(x.size(0)),
            scale,
        )

    def _apply_expand(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        w_t_all: Tuple[torch.Tensor, ...],
        offset_start: int,
        add_inputs: bool,
    ):
        """
        Run `v1_expand` on a 3-D `x` ([num_slices, num_tokens, rank]) into
        `y`.
        """
        # TODO (varun): Handle no_lora case - we can skip the op entirely.
        # dim 0 of x is the slice dimension; the token count is dim 1.
        num_tokens = x.size(1)
        v1_expand(
            x,
            w_t_all,
            y,
            *self.token_mapping_v1_meta.meta_args(num_tokens),
            offset_start=offset_start,
            add_inputs=add_inputs,
        )

    def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
                   x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
                   scale: float, **kwargs):
        """
        Performs GEMM for multiple slices of lora_a.

        `x` is flattened to 2-D and a single `v1_shrink` call handles all
        slices.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        """

        x = x.view(-1, x.shape[-1])
        self._apply_shrink(y, x, lora_a_stacked, scale)

    def add_expand(self,
                   y: torch.Tensor,
                   x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
                   lora_b_stacked: Tuple[torch.Tensor, ...],
                   lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
                   output_slices: Tuple[int, ...],
                   offset_start: int = 0,
                   add_inputs=True,
                   **kwargs) -> None:
        """
        Performs GEMM and bias addition for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] +
                    lora_bias_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor. Updated in place.
            x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]):
                bias's weight
            output_slices (Tuple[int, ...]): Every slice's size
            offset_start (int): Offset start for the output. Defaults to 0.
            add_inputs (bool): Defaults to True.
                NOTE(review): this argument is accepted but not forwarded;
                the kernel is always invoked with add_inputs=True. Confirm
                this is intended before relying on add_inputs=False.
        """
        # 2-D view of the same storage, so the in-place kernel updates reach
        # the caller's tensor.
        y = y.view(-1, y.shape[-1])
        if lora_bias_stacked is not None:
            self._apply_bias(self.token_lora_indices, y, output_slices,
                             lora_bias_stacked)

        # NOTE fused kernel
        self._apply_expand(y, x, lora_b_stacked, offset_start, add_inputs=True)

    def add_lora_embedding(self,
                           y: torch.Tensor,
                           x: torch.Tensor,
                           lora_b_stacked: torch.Tensor,
                           add_inputs: bool = True,
                           **kwargs) -> None:
        """
        Applies lora specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        """
        # x gains a leading slice dimension below, so its token count is
        # dim 0 of the original tensor.
        num_tokens = x.size(0)
        v1_expand(
            x.unsqueeze(dim=0),
            [lora_b_stacked],
            y,
            *self.token_mapping_v1_meta.meta_args(num_tokens),
            offset_start=0,
            add_inputs=add_inputs,
        )

    def add_lora_linear(self,
                        y: torch.Tensor,
                        x: torch.Tensor,
                        lora_a_stacked: Tuple[torch.Tensor, ...],
                        lora_b_stacked: Tuple[torch.Tensor, ...],
                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
                        scale: float,
                        output_slices: Tuple[int, ...],
                        *,
                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
                        **kwargs) -> None:
        """
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)+lora_bias_stacked[i]

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
            scale (float): Scaling factor.
            output_slices (Tuple[int, ...]): Every slice's size.
            buffer (Optional[Tuple[torch.Tensor, ...]]): Intermediate shrink
                output. Defaults to None, in which case a zeroed float32
                tensor of shape [num_slices, num_tokens, rank] is allocated.
        """

        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
        if lora_bias_stacked is not None:
            assert len(lora_bias_stacked) == len(output_slices)
            y = self._apply_bias(self.token_lora_indices, y, output_slices,
                                 lora_bias_stacked)

        if buffer is None:
            r = lora_b_stacked[0].size(-1)
            # We set the buffer to be float32 by default ,refer to:
            # https://github.com/triton-lang/triton/issues/1387
            buffer = torch.zeros(
                (len(output_slices), x.size(0), r),
                dtype=torch.float32,
                device=x.device,
            )
        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
        # The bias (if any) was already applied above, so none is passed here.
        self.add_expand(y,
                        buffer,
                        lora_b_stacked,
                        None,
                        output_slices,
                        add_inputs=True,
                        **kwargs)

    def add_lora_logits(self,
                        y: torch.Tensor,
                        x: torch.Tensor,
                        lora_a_stacked: torch.Tensor,
                        lora_b_stacked: torch.Tensor,
                        scale,
                        *,
                        buffer: Optional[torch.Tensor] = None,
                        **kwargs) -> None:
        """
        Applies lora specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor. Updated in place.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor):lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]):Default to None, in which case a
                zeroed float32 tensor of shape [num_slices, num_tokens, rank]
                is allocated.
        """

        # 2-D views of the same storage; the kernels update y in place.
        y = y.view(-1, y.shape[-1])
        x = x.view(-1, x.shape[-1])
        r = lora_b_stacked.size(-1)
        num_slices = lora_a_stacked.size(1)
        assert num_slices == 1, "lora for logits always has only 1 slice"

        if buffer is None:
            # We set the buffer to be float32 by default ,refer to:
            # https://github.com/triton-lang/triton/issues/1387
            buffer = torch.zeros((num_slices, x.size(0), r),
                                 dtype=torch.float32,
                                 device=x.device)

        v1_shrink(x, [lora_a_stacked], buffer,
                  *self.prompt_mapping_v1_meta.meta_args(x.size(0)), scale)

        # buffer is [num_slices, num_tokens, rank]: the token count is dim 1.
        v1_expand(buffer, [lora_b_stacked],
                  y,
                  *self.prompt_mapping_v1_meta.meta_args(buffer.size(1)),
                  add_inputs=True)
lora_mapping = LoRAMapping(token_lora_mapping, prompt_lora_mapping, is_prefill=True)