diff --git a/include/hidet/runtime/callbacks.h b/include/hidet/runtime/callbacks.h
index 3a104ac66..84fe7064c 100644
--- a/include/hidet/runtime/callbacks.h
+++ b/include/hidet/runtime/callbacks.h
@@ -9,6 +9,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#pragma once
+
 #include <cstdint>
 #include <hidet/runtime/common.h>
 
diff --git a/include/hidet/runtime/cpu/bfloat16.h b/include/hidet/runtime/cpu/bfloat16.h
index 864f41100..44dc93f55 100644
--- a/include/hidet/runtime/cpu/bfloat16.h
+++ b/include/hidet/runtime/cpu/bfloat16.h
@@ -88,7 +88,7 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 */
-
+#pragma once
 #include <cmath>
 #include <cstring>
 #include <stdint.h>
diff --git a/include/hidet/runtime/cpu/complex.h b/include/hidet/runtime/cpu/complex.h
index d1f3bd57e..9e7cad01a 100644
--- a/include/hidet/runtime/cpu/complex.h
+++ b/include/hidet/runtime/cpu/complex.h
@@ -9,6 +9,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#pragma once
+
 #include <complex>
 
 typedef std::complex<float> complex64_t;
diff --git a/include/hidet/runtime/cpu/float16.h b/include/hidet/runtime/cpu/float16.h
index 42bea52f8..011380edb 100644
--- a/include/hidet/runtime/cpu/float16.h
+++ b/include/hidet/runtime/cpu/float16.h
@@ -89,6 +89,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 */
 
+#pragma once
+
 #include <cmath>
 #include <cstring>
 #include <stdint.h>
diff --git a/include/hidet/runtime/cpu/float32.h b/include/hidet/runtime/cpu/float32.h
index 762336418..a2de27e72 100644
--- a/include/hidet/runtime/cpu/float32.h
+++ b/include/hidet/runtime/cpu/float32.h
@@ -9,6 +9,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+#pragma once
+
 #include <math.h>
 
 static inline float rsqrtf(float x)
diff --git a/python/hidet/graph/ops/__init__.py b/python/hidet/graph/ops/__init__.py
index 85e583309..5f31cc48c 100644
--- a/python/hidet/graph/ops/__init__.py
+++ b/python/hidet/graph/ops/__init__.py
@@ -10,7 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=redefined-builtin
-from .matmul import batch_matmul, matmul, matmul_x86, matmul_cublas
+from .matmul import batch_matmul, matmul, batch_matmul_x86, matmul_cublas
 from .conv1d import conv1d, conv1d_gemm
 from .conv1d_transpose import conv1d_transpose
 from .conv2d import conv2d, conv2d_channel_last, conv2d_winograd, conv2d_gemm, conv2d_gemm_fp16
diff --git a/python/hidet/graph/ops/matmul/__init__.py b/python/hidet/graph/ops/matmul/__init__.py
index ae3f4c217..d5ea6210b 100644
--- a/python/hidet/graph/ops/matmul/__init__.py
+++ b/python/hidet/graph/ops/matmul/__init__.py
@@ -16,4 +16,4 @@
 
 
 from .matmul_f32_x86 import Matmulx86Op, MatmulF32Taskx86
-from .matmul_f32_x86 import matmul_x86
+from .matmul_f32_x86 import batch_matmul_x86
diff --git a/python/hidet/graph/ops/matmul/matmul_f32_x86.py b/python/hidet/graph/ops/matmul/matmul_f32_x86.py
index eeb467b30..7126fd210 100644
--- a/python/hidet/graph/ops/matmul/matmul_f32_x86.py
+++ b/python/hidet/graph/ops/matmul/matmul_f32_x86.py
@@ -82,7 +82,7 @@ def implement_cpu(self, working_dir: str) -> Union[IRModule, List[IRModule]]:
         return tune.extract_ir_modules(self.schedule_matmulf32_x86)
 
     @tune.space(1, MC=[2016], NC=[256, 384, 512], KC=[384, 512, 560], ways=[(1, 4, 2, 1)])
-    def schedule_matmulf32_x86(self, MC=2016, NC=384, KC=560, ways=(1, 4, 2, 1)) -> IRModule:
+    def schedule_matmulf32_x86(self, MC=2016, NC=384, KC=560, ways=(1, 1, 1, 1)) -> IRModule:
         import hidet
         from hidet.ir.type import tensor_type
         from hidet.lang import tensor, grid, as_tensor_pointer
@@ -858,5 +858,5 @@ def __init__(self, a: Tensor, b: Tensor):
         super().__init__(inputs=[a, b], attributes={}, task=task)
 
 
-def matmul_x86(a: Tensor, b: Tensor) -> Tensor:
+def batch_matmul_x86(a: Tensor, b: Tensor) -> Tensor:
     return Matmulx86Op(a, b).outputs[0]
diff --git a/tests/operators/test_matmul.py b/tests/operators/test_matmul.py
index c5c67aa50..00b09de72 100644
--- a/tests/operators/test_matmul.py
+++ b/tests/operators/test_matmul.py
@@ -26,7 +26,7 @@ def test_matmul_x86(a_shape, b_shape):
         a_shape,
         b_shape,
         lambda x, y: np.matmul(x, y),
-        lambda x, y: ops.matmul_x86(x, y) - ops.matmul_x86(x, y) + ops.matmul_x86(x, y),
+        lambda x, y: ops.batch_matmul_x86(x, y) - ops.batch_matmul_x86(x, y) + ops.batch_matmul_x86(x, y),
         dtype="float32",
         atol=1e-4,
         rtol=1e-4,