From 73968c329657bd4a78b2d4e57b39c936779cf7e5 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 26 Dec 2025 13:12:26 +0800 Subject: [PATCH] issue/843: success per_channel_quant_int8 --- .../ops/quant/per_channel_quant_int8.h | 28 ++ .../per_channel_quant_int8/cuda/kernel.cuh | 277 ++++++++++++++ .../ops/quant/per_channel_quant_int8/info.h | 59 +++ .../nvidia/per_channel_quant_int8_nvidia.cu | 118 ++++++ .../nvidia/per_channel_quant_int8_nvidia.cuh | 7 + .../quant/per_channel_quant_int8/operator.cc | 98 +++++ .../per_channel_quant_int8.h | 40 ++ test/infiniop/libinfiniop/op_register.py | 35 ++ test/infiniop/per_channel_quant_int8.py | 194 ++++++++++ test/infiniop/w8a8_per_channel.py | 347 ++++++++++++++++++ xmake.lua | 2 +- xmake/nvidia.lua | 2 +- 12 files changed, 1205 insertions(+), 2 deletions(-) create mode 100644 include/infiniop/ops/quant/per_channel_quant_int8.h create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/info.h create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cu create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/nvidia/per_channel_quant_int8_nvidia.cuh create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/operator.cc create mode 100644 src/infiniop/ops/quant/per_channel_quant_int8/per_channel_quant_int8.h create mode 100644 test/infiniop/per_channel_quant_int8.py create mode 100644 test/infiniop/w8a8_per_channel.py diff --git a/include/infiniop/ops/quant/per_channel_quant_int8.h b/include/infiniop/ops/quant/per_channel_quant_int8.h new file mode 100644 index 000000000..ce21f4556 --- /dev/null +++ b/include/infiniop/ops/quant/per_channel_quant_int8.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ +#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__ + +#include "../../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t; + 
+
+__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
+                                                                      infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
+                                                                      infiniopTensorDescriptor_t x_packed_desc,
+                                                                      infiniopTensorDescriptor_t x_scale_desc,
+                                                                      infiniopTensorDescriptor_t x_zero_desc,
+                                                                      infiniopTensorDescriptor_t x_desc);
+
+__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
+                                                      void *workspace,
+                                                      size_t workspace_size,
+                                                      void *x_packed,
+                                                      void *x_scale,
+                                                      void *x_zero,
+                                                      const void *x,
+                                                      void *stream);
+
+__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh
new file mode 100644
index 000000000..a3cbdbe01
--- /dev/null
+++ b/src/infiniop/ops/quant/per_channel_quant_int8/cuda/kernel.cuh
@@ -0,0 +1,277 @@
+#ifndef __PERCHANNEL_QUANTINT8_KERNEL_CUH__
+#define __PERCHANNEL_QUANTINT8_KERNEL_CUH__
+
+#include <cub/cub.cuh> // NOTE(review): angle-bracket text was stripped in transit; confirm the exact header against the original patch
+// Round to nearest integer, ties away from zero (matches the Python
+// reference's rounding, unlike rintf's round-half-to-even).
+__device__ inline int round_half_away_from_zero(float x) {
+    float ax = fabsf(x);
+    float r = floorf(ax + 0.5f);
+    return (x >= 0.0f) ? (int)r : -(int)r;
+}
+
+// Asymmetric per-row int8 quantization of an M x K tensor.
+// Expected launch: gridDim.x == M (one block per row), blockDim.x == BLOCK_SIZE.
+template <unsigned int BLOCK_SIZE, typename Tdata> // NOTE(review): template parameter list restored after angle-bracket stripping
+__device__ void blockPerChannelQuantI8Kernel(
+    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x,
+    int M, int K) {
+    int row = blockIdx.x;
+    int tid = row * K;
+
+    // ---- 1. reduce max ----
+    float local_max = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata, float>( // NOTE(review): template args reconstructed; verify signature
+        x + tid, K);
+
+    __shared__ float global_max_f;
+    if (threadIdx.x == 0) {
+        global_max_f = local_max;
+    }
+    __syncthreads();
+
+    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    // ---- 2. 
reduce min ----
+    float thread_min = __FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_min = fminf(thread_min, (float)x[tid + ind]);
+    }
+    float local_min = BlockReduce(temp_storage).Reduce(thread_min, cub::Min());
+
+    __shared__ float global_min_f;
+    if (threadIdx.x == 0) {
+        global_min_f = local_min;
+    }
+    __syncthreads();
+
+    // ---- 3. compute scale/zero in float (to match the Python reference) ----
+    float global_max = global_max_f;
+    float global_min = global_min_f;
+
+    float scale = (global_max - global_min) / 255.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+    float zero = -global_min * inv_scale - 128.0f;
+
+    // write back scale/zero; outputs are float — do NOT round-trip through
+    // Tdata, which would lose precision when Tdata is half/bf16
+    x_scale[row] = scale;
+    x_zero[row] = zero;
+
+    // ---- 4. quantize in float with half-away-from-zero rounding (bit-exact vs Python) ----
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale + zero;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -128) {
+            q = -128;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+// Symmetric per-row int8 quantization (no zero point), same launch layout.
+template <unsigned int BLOCK_SIZE, typename Tdata> // NOTE(review): template parameter list restored after angle-bracket stripping
+__device__ void blockPerChannelQuantI8SymKernel(
+    int8_t *x_packed, float *x_scale, const Tdata *x,
+    int M, int K) {
+    int row = blockIdx.x;
+    int tid = row * K;
+
+    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    // ---- 2. reduce max of |x| ----
+    float thread_max = -__FLT_MAX__;
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+        thread_max = fmaxf(thread_max, fabsf((float)x[tid + ind]));
+    }
+    float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max());
+
+    __shared__ float global_max_f;
+    if (threadIdx.x == 0) {
+        global_max_f = local_max;
+    }
+    __syncthreads();
+
+    // ---- 3. 
compute scale in float (to match the Python reference) ----
+    float global_max = global_max_f;
+
+    float scale = global_max / 127.0f;
+    if (scale < 1e-8f) {
+        scale = 1e-8f;
+    }
+
+    float inv_scale = 1.0f / scale;
+
+    // write back scale; output is float — do NOT round-trip through Tdata,
+    // which would lose precision when Tdata is half/bf16
+    x_scale[row] = scale;
+
+    // ---- 4. quantize in float with half-away-from-zero rounding (bit-exact vs Python) ----
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -127) { // symmetric range: clamp to [-127, 127]
+            q = -127;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <typename T> // NOTE(review): template parameter lists here and below restored after angle-bracket stripping
+struct MaxOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return max(a, b);
+    }
+};
+template <typename T>
+struct MinOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return min(a, b);
+    }
+};
+template