Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,19 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]

```

##### 试验功能 -- 编译marlin相关算子

```shell

# 需要从 GitHub 上克隆 tvm-ffi 仓库,克隆命令参考如下
## tvm-ffi commit: 35c99d0ac4cb784862115d0089f60c603acec8f9
git clone https://github.com/apache/tvm-ffi.git --recursive

# 设置TVM_ROOT
export TVM_ROOT=<path-to>/tvm-ffi #用来搜索tvm相关头文件
# 注意:编译 gptq_marlin_gemm 算子时,除了指定 TVM_ROOT 以外,还需要指定 cuda_arch
```

2. 编译安装

默认安装路径为 `$HOME/.infini`。
Expand Down
13 changes: 13 additions & 0 deletions include/infinicore/ops/awq_marlin_gemm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

// Declares the AwqMarlinGemm graph operator. The type list after the name is the
// operator's argument signature, in the same order as awq_marlin_gemm_ below:
// (c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm,
//  b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float).
// The constructor and the static execute() are defined in awq_marlin_gemm.cc.
INFINICORE_GRAPH_OP_CLASS(AwqMarlinGemm, Tensor, const Tensor &, const Tensor &, Tensor &, Tensor &, Tensor &, Tensor &, Tensor &, Tensor &, Tensor &, int64_t, bool, bool, bool, bool);

// Convenience entry point: forwards every argument unchanged to
// AwqMarlinGemm::execute. The trailing underscore and the by-value `c` suggest
// that `c` is the destination tensor written by the op — TODO confirm.
// NOTE(review): the meaning of b_q_type_id and of the four boolean flags is
// defined by the backend kernel and is not visible from this header.
void awq_marlin_gemm_(Tensor c, const Tensor &a, const Tensor &b, Tensor &b_bias, Tensor &b_scales, Tensor &a_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float);
} // namespace infinicore::op
13 changes: 13 additions & 0 deletions include/infinicore/ops/gptq_marlin_gemm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

// Declares the GptqMarlinGemm graph operator. The type list after the name is the
// operator's argument signature, in the same order as gptq_marlin_gemm_ below:
// (out, a, b, b_scales, global_scales, b_zeros, g_idx, perm,
//  b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float).
// The constructor and the static execute() are defined in gptq_marlin_gemm.cc.
INFINICORE_GRAPH_OP_CLASS(GptqMarlinGemm, Tensor, const Tensor &, const Tensor &, Tensor &, Tensor &, Tensor &, Tensor &, Tensor &, int64_t, bool, bool, bool, bool);

// Convenience entry point: forwards every argument unchanged to
// GptqMarlinGemm::execute. `out` is presumably the destination tensor written
// by the op — TODO confirm.
// NOTE(review): the meaning of b_q_type_id and of the four boolean flags is
// defined by the backend kernel and is not visible from this header.
void gptq_marlin_gemm_(Tensor out, const Tensor &a, const Tensor &b, Tensor &b_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float);
} // namespace infinicore::op
2 changes: 2 additions & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool1d.h"
#include "infiniop/ops/avg_pool3d.h"
#include "infiniop/ops/awq_marlin_gemm.h"
#include "infiniop/ops/axpy.h"
#include "infiniop/ops/binary_cross_entropy_with_logits.h"
#include "infiniop/ops/blas_amax.h"
Expand Down Expand Up @@ -55,6 +56,7 @@
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gelutanh.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/gptq_marlin_gemm.h"
#include "infiniop/ops/gptq_qyblas_gemm.h"
#include "infiniop/ops/hardswish.h"
#include "infiniop/ops/hardtanh.h"
Expand Down
46 changes: 46 additions & 0 deletions include/infiniop/ops/awq_marlin_gemm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef __INFINIOP_AWQ_MARLIN_GEMM_API_H__
#define __INFINIOP_AWQ_MARLIN_GEMM_API_H__

#include "../operator_descriptor.h"
// NOTE(review): <cstdint> is a C++-only header, while the functions below are
// exported through __INFINI_C (presumably a C-linkage marker). If this header is
// meant to be includable from C, it would need <stdint.h> (and <stdbool.h> /
// <stddef.h> for bool / size_t) instead — confirm against the other op headers.
#include <cstdint>

// Opaque handle to an AWQ Marlin GEMM operator descriptor.
typedef struct InfiniopDescriptor *infiniopAwqMarlinGemmDescriptor_t;

/**
 * Creates an AWQ Marlin GEMM descriptor from the tensor descriptors of every
 * operand.
 *
 * @param handle              infiniop handle the descriptor is created on.
 * @param desc_ptr            receives the newly created descriptor.
 * @param out_desc            descriptor of the result tensor (`c` in infiniopAwqMarlinGemm).
 * @param a_desc              descriptor of operand `a`.
 * @param b_desc              descriptor of operand `b`.
 * @param b_bias_desc         descriptor of `b_bias`.
 * @param b_scales_desc       descriptor of `b_scales`.
 * @param a_scales_desc       descriptor of `a_scales`.
 * @param global_scales_desc  descriptor of `global_scales`.
 * @param b_zeros_desc        descriptor of `b_zeros`.
 * @param g_idx_desc          descriptor of `g_idx`.
 * @param perm_desc           descriptor of `perm`.
 * @return infiniStatus_t status code.
 *
 * NOTE(review): expected shapes/dtypes and which operands may be empty are
 * decided by the backend implementation and are not visible in this header.
 */
__INFINI_C __export infiniStatus_t infiniopCreateAwqMarlinGemmDescriptor(infiniopHandle_t handle,
infiniopAwqMarlinGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t b_bias_desc,
infiniopTensorDescriptor_t b_scales_desc,
infiniopTensorDescriptor_t a_scales_desc,
infiniopTensorDescriptor_t global_scales_desc,
infiniopTensorDescriptor_t b_zeros_desc,
infiniopTensorDescriptor_t g_idx_desc,
infiniopTensorDescriptor_t perm_desc);

/**
 * Queries the workspace size required by infiniopAwqMarlinGemm for `desc`.
 *
 * @param desc  descriptor created by infiniopCreateAwqMarlinGemmDescriptor.
 * @param size  receives the required workspace size (presumably in bytes — TODO confirm).
 * @return infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopGetAwqMarlinGemmWorkspaceSize(infiniopAwqMarlinGemmDescriptor_t desc, size_t *size);

/**
 * Launches the AWQ Marlin GEMM on `stream`.
 *
 * @param desc            descriptor created by infiniopCreateAwqMarlinGemmDescriptor.
 * @param workspace       scratch buffer for the kernel.
 * @param workspace_size  size of `workspace`; see infiniopGetAwqMarlinGemmWorkspaceSize.
 * @param c               result buffer (the only tensor argument besides the
 *                        quantization metadata that is not const-qualified).
 * @param a               read-only operand `a`.
 * @param b               read-only operand `b`.
 * @param b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm
 *                        auxiliary buffers matching the descriptors passed at
 *                        creation time. They are taken as non-const `void *`;
 *                        whether the kernel writes to them is not visible here.
 * @param b_q_type_id     integer id selecting the quantized type of `b`
 *                        (id-to-type mapping defined by the backend — TODO confirm).
 * @param is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float
 *                        kernel option flags; semantics defined by the backend.
 * @param stream          backend stream/queue handle the work is submitted to.
 * @return infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopAwqMarlinGemm(infiniopAwqMarlinGemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *b_bias,
void *b_scales,
void *a_scales,
void *global_scales,
void *b_zeros,
void *g_idx,
void *perm,
int64_t b_q_type_id,
bool is_k_full,
bool use_atomic_add,
bool use_fp32_reduce,
bool is_zp_float,
void *stream);

/**
 * Destroys a descriptor created by infiniopCreateAwqMarlinGemmDescriptor.
 *
 * @param desc  descriptor to destroy.
 * @return infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyAwqMarlinGemmDescriptor(infiniopAwqMarlinGemmDescriptor_t desc);

#endif
42 changes: 42 additions & 0 deletions include/infiniop/ops/gptq_marlin_gemm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef __INFINIOP_GPTQ_MARLIN_GEMM_API_H__
#define __INFINIOP_GPTQ_MARLIN_GEMM_API_H__

#include "../operator_descriptor.h"
// NOTE(review): <cstdint> is a C++-only header, while the functions below are
// exported through __INFINI_C (presumably a C-linkage marker). If this header is
// meant to be includable from C, it would need <stdint.h> (and <stdbool.h> /
// <stddef.h> for bool / size_t) instead — confirm against the other op headers.
#include <cstdint>

// Opaque handle to a GPTQ Marlin GEMM operator descriptor.
typedef struct InfiniopDescriptor *infiniopGptqMarlinGemmDescriptor_t;

/**
 * Creates a GPTQ Marlin GEMM descriptor from the tensor descriptors of every
 * operand.
 *
 * @param handle              infiniop handle the descriptor is created on.
 * @param desc_ptr            receives the newly created descriptor.
 * @param out_desc            descriptor of the result tensor (`out` in infiniopGptqMarlinGemm).
 * @param a_desc              descriptor of operand `a`.
 * @param b_desc              descriptor of operand `b`.
 * @param b_scales_desc       descriptor of `b_scales`.
 * @param global_scales_desc  descriptor of `global_scales`.
 * @param b_zeros_desc        descriptor of `b_zeros`.
 * @param g_idx_desc          descriptor of `g_idx`.
 * @param perm_desc           descriptor of `perm`.
 * @return infiniStatus_t status code.
 *
 * NOTE(review): expected shapes/dtypes and which operands may be empty are
 * decided by the backend implementation and are not visible in this header.
 */
__INFINI_C __export infiniStatus_t infiniopCreateGptqMarlinGemmDescriptor(infiniopHandle_t handle,
infiniopGptqMarlinGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t b_scales_desc,
infiniopTensorDescriptor_t global_scales_desc,
infiniopTensorDescriptor_t b_zeros_desc,
infiniopTensorDescriptor_t g_idx_desc,
infiniopTensorDescriptor_t perm_desc);

/**
 * Queries the workspace size required by infiniopGptqMarlinGemm for `desc`.
 *
 * @param desc  descriptor created by infiniopCreateGptqMarlinGemmDescriptor.
 * @param size  receives the required workspace size (presumably in bytes — TODO confirm).
 * @return infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopGetGptqMarlinGemmWorkspaceSize(infiniopGptqMarlinGemmDescriptor_t desc, size_t *size);

/**
 * Launches the GPTQ Marlin GEMM on `stream`.
 *
 * @param desc            descriptor created by infiniopCreateGptqMarlinGemmDescriptor.
 * @param workspace       scratch buffer for the kernel.
 * @param workspace_size  size of `workspace`; see infiniopGetGptqMarlinGemmWorkspaceSize.
 * @param out             result buffer.
 * @param a               read-only operand `a`.
 * @param b               read-only operand `b`.
 * @param b_scales, global_scales, b_zeros, g_idx, perm
 *                        auxiliary buffers matching the descriptors passed at
 *                        creation time. They are taken as non-const `void *`;
 *                        whether the kernel writes to them is not visible here.
 * @param b_q_type_id     integer id selecting the quantized type of `b`
 *                        (id-to-type mapping defined by the backend — TODO confirm).
 * @param is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float
 *                        kernel option flags; semantics defined by the backend.
 * @param stream          backend stream/queue handle the work is submitted to.
 * @return infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopGptqMarlinGemm(infiniopGptqMarlinGemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *a,
const void *b,
void *b_scales,
void *global_scales,
void *b_zeros,
void *g_idx,
void *perm,
int64_t b_q_type_id,
bool is_k_full,
bool use_atomic_add,
bool use_fp32_reduce,
bool is_zp_float,
void *stream);

/**
 * Destroys a descriptor created by infiniopCreateGptqMarlinGemmDescriptor.
 *
 * @param desc  descriptor to destroy.
 * @return infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyGptqMarlinGemmDescriptor(infiniopGptqMarlinGemmDescriptor_t desc);

#endif
21 changes: 21 additions & 0 deletions src/infinicore/ops/awq_marlin_gemm/awq_marlin_gemm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include "infinicore/ops/awq_marlin_gemm.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

// Emits the dispatcher definitions that belong to the AwqMarlinGemm graph op
// (the macro expansion is not visible in this file).
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(AwqMarlinGemm);

// Builds one AwqMarlinGemm op instance: checks all ten tensors with
// INFINICORE_ASSERT_TENSORS_SAME_DEVICE, then dispatches on the device type of
// `c`, passing every tensor and scalar option through in declaration order.
AwqMarlinGemm::AwqMarlinGemm(Tensor c, const Tensor &a, const Tensor &b, Tensor &b_bias, Tensor &b_scales, Tensor &a_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm, b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float);
}
// Static entry point: hands the full argument list to
// INFINICORE_GRAPH_OP_RECORD_OR_RUN, which (going by its name) either records the
// op into a graph being captured or runs it immediately — TODO confirm against
// the macro definition.
void AwqMarlinGemm::execute(Tensor c, const Tensor &a, const Tensor &b, Tensor &b_bias, Tensor &b_scales, Tensor &a_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(AwqMarlinGemm, c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm, b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float);
}

// Free-function API: a thin forwarder to AwqMarlinGemm::execute with the
// arguments in the same order.
void awq_marlin_gemm_(Tensor c, const Tensor &a, const Tensor &b, Tensor &b_bias, Tensor &b_scales, Tensor &a_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
AwqMarlinGemm::execute(c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm, b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float);
}

} // namespace infinicore::op
82 changes: 82 additions & 0 deletions src/infinicore/ops/awq_marlin_gemm/awq_marlin_gemm_infiniop.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/awq_marlin_gemm.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>

namespace infinicore::op::awq_marlin_gemm_impl::infiniop {

// Declares the cached descriptor wrapper type `Descriptor` for the AwqMarlinGemm
// infiniop operator.
// NOTE(review): 100 is presumably the descriptor-cache capacity — confirm
// against the INFINIOP_CACHABLE_DESCRIPTOR definition.
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, AwqMarlinGemm, 100);

// Everything `run` needs to replay one AwqMarlinGemm call: the (shared, cached)
// infiniop descriptor, the workspace and operand tensors, and the scalar options.
// Field order matters: `plan` fills this struct by aggregate initialization.
struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm;
    int64_t b_q_type_id;
    bool is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float;
};

// Prepares one AwqMarlinGemm invocation.
//
// Looks up (or creates) the infiniop descriptor keyed by a hash of the ten
// tensors, allocates the workspace tensor for it, and captures all operands and
// options in a heap-allocated PlannedMeta.
//
// Returns an owning pointer, type-erased to void *; it must be released with
// `cleanup`.
void *plan(Tensor c, const Tensor &a, const Tensor &b, Tensor &b_bias, Tensor &b_scales, Tensor &a_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
    // Only the tensors take part in the cache key: the scalar options are not
    // arguments of descriptor creation, they are passed at launch time in `run`.
    size_t seed = hash_combine(c, a, b, b_bias, b_scales, a_scales, global_scales, b_zeros, g_idx, perm);

    // Introduces the local `descriptor` used below.
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, AwqMarlinGemm,
        seed,
        c->desc(), a->desc(),
        b->desc(), b_bias->desc(), b_scales->desc(), a_scales->desc(), global_scales->desc(), b_zeros->desc(), g_idx->desc(), perm->desc());

    // Introduces the local `workspace` tensor sized for this descriptor.
    INFINIOP_WORKSPACE_TENSOR(workspace, AwqMarlinGemm, descriptor);

    // Raw `new` is required here because ownership crosses a void * interface;
    // the matching `delete` lives in `cleanup`.
    return new PlannedMeta{
        descriptor,
        graph::GraphTensor(workspace),
        graph::GraphTensor(c),
        graph::GraphTensor(a),
        graph::GraphTensor(b),
        graph::GraphTensor(b_bias),
        graph::GraphTensor(b_scales),
        graph::GraphTensor(a_scales),
        graph::GraphTensor(global_scales),
        graph::GraphTensor(b_zeros),
        graph::GraphTensor(g_idx),
        graph::GraphTensor(perm),
        b_q_type_id,
        is_k_full,
        use_atomic_add,
        use_fp32_reduce,
        is_zp_float};
}

// Launches the planned AwqMarlinGemm on the current context stream.
// `planned_meta` must be a pointer previously returned by `plan`.
void run(void *planned_meta) {
    // void * -> object pointer is a static_cast conversion; no reinterpretation
    // is needed.
    auto *planned = static_cast<PlannedMeta *>(planned_meta);

    INFINICORE_CHECK_ERROR(infiniopAwqMarlinGemm(
        planned->descriptor->desc,
        planned->workspace->data(),
        // NOTE(review): numel() is used as the workspace size, which assumes the
        // workspace tensor has a one-byte element type — confirm.
        planned->workspace->numel(),
        planned->c->data(),
        planned->a->data(),
        planned->b->data(),
        planned->b_bias->data(),
        planned->b_scales->data(),
        planned->a_scales->data(),
        planned->global_scales->data(),
        planned->b_zeros->data(),
        planned->g_idx->data(),
        planned->perm->data(),
        planned->b_q_type_id,
        planned->is_k_full,
        planned->use_atomic_add,
        planned->use_fp32_reduce,
        planned->is_zp_float,
        context::getStream()));
}

// Destroys the PlannedMeta created by `plan` and nulls the caller's pointer so
// it cannot be reused or freed twice.
void cleanup(void **planned_meta_ptr) {
    // Load the stored void * first, then convert the value. Casting the
    // void ** itself to PlannedMeta ** would read a `void *` object through a
    // `PlannedMeta *` lvalue, which violates the type-aliasing rules.
    delete static_cast<PlannedMeta *>(*planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

// Registers plan/run/cleanup as the AwqMarlinGemm implementation for all devices.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(AwqMarlinGemm, &plan, &run, &cleanup);

} // namespace infinicore::op::awq_marlin_gemm_impl::infiniop
21 changes: 21 additions & 0 deletions src/infinicore/ops/gptq_marlin_gemm/gptq_marlin_gemm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include "infinicore/ops/gptq_marlin_gemm.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

// Emits the dispatcher definitions that belong to the GptqMarlinGemm graph op
// (the macro expansion is not visible in this file).
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(GptqMarlinGemm);

// Builds one GptqMarlinGemm op instance: checks all eight tensors with
// INFINICORE_ASSERT_TENSORS_SAME_DEVICE, then dispatches on the device type of
// `out`, passing every tensor and scalar option through in declaration order.
GptqMarlinGemm::GptqMarlinGemm(Tensor out, const Tensor &a, const Tensor &b, Tensor &b_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b, b_scales, global_scales, b_zeros, g_idx, perm);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, a, b, b_scales, global_scales, b_zeros, g_idx, perm, b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float);
}
// Static entry point: hands the full argument list to
// INFINICORE_GRAPH_OP_RECORD_OR_RUN, which (going by its name) either records the
// op into a graph being captured or runs it immediately — TODO confirm against
// the macro definition.
void GptqMarlinGemm::execute(Tensor out, const Tensor &a, const Tensor &b, Tensor &b_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(GptqMarlinGemm, out, a, b, b_scales, global_scales, b_zeros, g_idx, perm, b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float);
}

// Free-function API: a thin forwarder to GptqMarlinGemm::execute with the
// arguments in the same order.
void gptq_marlin_gemm_(Tensor out, const Tensor &a, const Tensor &b, Tensor &b_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
GptqMarlinGemm::execute(out, a, b, b_scales, global_scales, b_zeros, g_idx, perm, b_q_type_id, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float);
}

} // namespace infinicore::op
78 changes: 78 additions & 0 deletions src/infinicore/ops/gptq_marlin_gemm/gptq_marlin_gemm_infiniop.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/gptq_marlin_gemm.hpp"
#include <infiniop.h>

namespace infinicore::op::gptq_marlin_gemm_impl::infiniop {

// Declares the cached descriptor wrapper type `Descriptor` for the GptqMarlinGemm
// infiniop operator.
// NOTE(review): 100 is presumably the descriptor-cache capacity — confirm
// against the INFINIOP_CACHABLE_DESCRIPTOR definition.
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, GptqMarlinGemm, 100);

// Everything `run` needs to replay one GptqMarlinGemm call: the (shared, cached)
// infiniop descriptor, the workspace and operand tensors, and the scalar options.
// Field order matters: `plan` fills this struct by aggregate initialization.
struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, out, a, b, b_scales, global_scales, b_zeros, g_idx, perm;
    int64_t b_q_type_id;
    bool is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float;
};

// Prepares one GptqMarlinGemm invocation.
//
// Looks up (or creates) the infiniop descriptor keyed by a hash of the eight
// tensors, allocates the workspace tensor for it, and captures all operands and
// options in a heap-allocated PlannedMeta.
//
// Returns an owning pointer, type-erased to void *; it must be released with
// `cleanup`.
void *plan(Tensor out, const Tensor &a, const Tensor &b, Tensor &b_scales, Tensor &global_scales, Tensor &b_zeros, Tensor &g_idx, Tensor &perm, int64_t b_q_type_id, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
    // Only the tensors take part in the cache key: the scalar options are not
    // arguments of descriptor creation, they are passed at launch time in `run`.
    size_t seed = hash_combine(out, a, b, b_scales, global_scales, b_zeros, g_idx, perm);

    // Introduces the local `descriptor` used below.
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, GptqMarlinGemm,
        seed,
        out->desc(), a->desc(),
        b->desc(), b_scales->desc(), global_scales->desc(), b_zeros->desc(), g_idx->desc(), perm->desc());

    // Introduces the local `workspace` tensor sized for this descriptor.
    INFINIOP_WORKSPACE_TENSOR(workspace, GptqMarlinGemm, descriptor);

    // Raw `new` is required here because ownership crosses a void * interface;
    // the matching `delete` lives in `cleanup`.
    return new PlannedMeta{
        descriptor,
        graph::GraphTensor(workspace),
        graph::GraphTensor(out),
        graph::GraphTensor(a),
        graph::GraphTensor(b),
        graph::GraphTensor(b_scales),
        graph::GraphTensor(global_scales),
        graph::GraphTensor(b_zeros),
        graph::GraphTensor(g_idx),
        graph::GraphTensor(perm),
        b_q_type_id,
        is_k_full,
        use_atomic_add,
        use_fp32_reduce,
        is_zp_float};
}

// Launches the planned GptqMarlinGemm on the current context stream.
// `planned_meta` must be a pointer previously returned by `plan`.
void run(void *planned_meta) {
    // void * -> object pointer is a static_cast conversion; no reinterpretation
    // is needed.
    auto *planned = static_cast<PlannedMeta *>(planned_meta);

    INFINICORE_CHECK_ERROR(infiniopGptqMarlinGemm(
        planned->descriptor->desc,
        planned->workspace->data(),
        // NOTE(review): numel() is used as the workspace size, which assumes the
        // workspace tensor has a one-byte element type — confirm.
        planned->workspace->numel(),
        planned->out->data(),
        planned->a->data(),
        planned->b->data(),
        planned->b_scales->data(),
        planned->global_scales->data(),
        planned->b_zeros->data(),
        planned->g_idx->data(),
        planned->perm->data(),
        planned->b_q_type_id,
        planned->is_k_full,
        planned->use_atomic_add,
        planned->use_fp32_reduce,
        planned->is_zp_float,
        context::getStream()));
}

// Destroys the PlannedMeta created by `plan` and nulls the caller's pointer so
// it cannot be reused or freed twice.
void cleanup(void **planned_meta_ptr) {
    // Load the stored void * first, then convert the value. Casting the
    // void ** itself to PlannedMeta ** would read a `void *` object through a
    // `PlannedMeta *` lvalue, which violates the type-aliasing rules.
    delete static_cast<PlannedMeta *>(*planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

// Registers plan/run/cleanup as the GptqMarlinGemm implementation for all devices.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(GptqMarlinGemm, &plan, &run, &cleanup);

} // namespace infinicore::op::gptq_marlin_gemm_impl::infiniop
Loading
Loading