Skip to content
This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 60e0fa0

Browse files
authored
Merge branch 'main' into wangchang/lm-eval
2 parents e74eb85 + c263d09 commit 60e0fa0

File tree

6 files changed: 50 additions (+50) and 29 deletions (-29).

docs/publication.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
Full Publications/Events (50)
22
==========
33

4-
## 2024 (11)
4+
## 2024 (12)
5+
* Blog published on digit.in: [AI hallucination in LLM and beyond: Will it ever be fixed?](https://www.digit.in/features/general/ai-hallucination-in-llm-and-beyond-can-it-be-fixed-completely.html)
56
* Blog published on Medium: [Accelerating Qwen2 Models with Intel Extension for Transformers](https://medium.com/intel-analytics-software/accelerating-qwen2-models-with-intel-extension-for-transformers-99403de82f68) (June 2024)
67
* Blog published on Huggingface: [Building Cost-Efficient Enterprise RAG applications with Intel Gaudi 2 and Intel Xeon](https://huggingface.co/blog/cost-efficient-rag-applications-with-intel) (May 2024)
78
* Blog published on Intel Developer News: [Efficient Natural Language Embedding Models with Intel® Extension for Transformers](https://www.intel.com/content/www/us/en/developer/articles/technical/efficient-natural-language-embedding-models.html) (May 2024)

intel_extension_for_transformers/neural_chat/pipeline/plugins/retrieval/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ langchain_core==0.1.35
99
langid
1010
markdown
1111
openpyxl
12+
protobuf==3.20
1213
PyMuPDF
1314
python-docx
1415
qdrant-client==1.9.0

intel_extension_for_transformers/qbits/dispatcher/include/bestla_weightonly_dispatcher.hpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414
#pragma once
1515
#include <ATen/core/TensorBody.h>
1616
#include <torch/torch.h>
17-
#include "bestla/bestla_storage.h"
1817
#include "../include/dispatcher_utils.hpp"
1918
#include <string.h>
2019
#include <assert.h>

intel_extension_for_transformers/qbits/dispatcher/include/dispatcher_utils.hpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
#include <chrono>
1717
#include <string>
1818
#include "bestla/bestla_device.h"
19+
#include "bestla/bestla_storage.h"
1920
#include "bestla/bestla_utils.h"
2021
#include "bestla/bestla_parallel.h"
2122
namespace dispatcher_utils {
@@ -26,6 +27,12 @@ inline bool check_avx_vnni() { return bestla::device::CpuDevice::getInstance()->
2627
inline bool check_avx512f() { return bestla::device::CpuDevice::getInstance()->AVX512F(); }
2728
inline bool check_avx2() { return bestla::device::CpuDevice::getInstance()->AVX2(); }
2829

30+
template <class GemmCore>
31+
constexpr bool is_int8_cmpt_gemmcore() {
32+
return GemmCore::ISA == BTLA_ISA::AMX_INT8 || GemmCore::ISA == BTLA_ISA::AVX512_VNNI ||
33+
GemmCore::ISA == BTLA_ISA::AVX_VNNI || std::is_same_v<GemmCore, bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>>;
34+
}
35+
2936
class qbits_threading {
3037
public:
3138
static bestla::parallel::IThreading* get() {

intel_extension_for_transformers/qbits/dispatcher/src/bestla_packq_impl.cpp

Lines changed: 35 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,19 @@
1616
#include "../include/bestla_packq_impl.hpp"
1717

1818
namespace woq {
19-
template <class GemmCore, BTLA_ISA ISA>
19+
20+
template <class proB>
2021
void execute_qpack(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
21-
using proB = bestla::prologue_b::gemm::WeightKBlockNInteger<GemmCore, ISA>;
2222
static proB ker;
23-
auto qpackw = ker.createStorage(ctx->n, ctx->k, p->blocksize, wei2bestladt_map.at(p->weight_type),
24-
scale2bestladt_map.at(p->scale_type), BTLA_DTYPE::BF16, p->asym);
23+
using WType = typename proB::StorageWeight;
24+
WType qpackw(0);
25+
if constexpr (std::is_same_v<WType, bestla::storage::gemm::StorageWeightKBlockNInteger>) {
26+
qpackw = ker.createStorage(ctx->n, ctx->k, p->blocksize, wei2bestladt_map.at(p->weight_type),
27+
scale2bestladt_map.at(p->scale_type), BTLA_DTYPE::BF16, p->asym);
28+
} else {
29+
qpackw = ker.createStorage(ctx->n, ctx->k, p->blocksize, wei2bestladt_map.at(p->weight_type),
30+
scale2bestladt_map.at(p->scale_type));
31+
}
2532
if (p->enable_act_shuffle) ker.enableShuffle(&qpackw);
2633
ctx->packw_size = qpackw.mSize;
2734
if (task == WOQ_GET_PACKW_SIZE) return;
@@ -33,6 +40,20 @@ void execute_qpack(repack_quantized_weight_param* p, repack_quantized_weight_ctx
3340
p->asym ? ctx->zp->data_ptr<int8_t>() : nullptr, &qpackw, dispatcher_utils::qbits_threading::get());
3441
}
3542

43+
template <class GemmCore, BTLA_ISA ISA>
44+
void parse_prob(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
45+
if (p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
46+
p->weight_type == "int2_clip") {
47+
return execute_qpack<bestla::prologue_b::gemm::WeightKBlockNInteger<GemmCore, ISA>>(p, ctx, task);
48+
}
49+
if (p->weight_type == "nf4" || p->weight_type == "fp4_e2m1_bnb" || p->weight_type == "fp4_e2m1") {
50+
TORCH_CHECK(!p->asym, "Qbits: float-weight unsupports asym quantization.");
51+
return execute_qpack<bestla::prologue_b::gemm::WeightKBlockNFloat<GemmCore, ISA>>(p, ctx, task);
52+
}
53+
TORCH_CHECK(false, "Qbits: unsupported bestla packq config, compute_type: " + p->compute_type +
54+
" weight_type: " + p->weight_type);
55+
}
56+
3657
std::string get_dtype_str(BTLA_DTYPE dtype) {
3758
switch (dtype) {
3859
case BTLA_DTYPE::F32:
@@ -183,40 +204,38 @@ torch::Tensor get_packw_info(torch::Tensor& packw, PACKW_ACQUIRE_TYPE ACQ_T) {
183204
}
184205

185206
void bestla_packq(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
186-
// TODO(zhe): elegant impl.
187-
TORCH_CHECK(p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
188-
p->weight_type == "int2_clip",
189-
"Qbits: only support Integer WOQ in PACKQ");
190-
191207
if (p->compute_type == "int8") {
208+
TORCH_CHECK(p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
209+
p->weight_type == "int2_clip",
210+
"Qbits: only support Integer weight-type with int8 compute-type");
192211
if (dispatcher_utils::check_amx() && p->blocksize % bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>::KTILE == 0) {
193-
return execute_qpack<bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>, BTLA_ISA::AMX_INT8>(p, ctx, task);
212+
return parse_prob<bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>, BTLA_ISA::AMX_INT8>(p, ctx, task);
194213
}
195214
if (dispatcher_utils::check_avx512_vnni() &&
196215
p->blocksize % bestla::gemm::ICoreRowNAvx512vnniKBlock<48, 4>::KTILE == 0) {
197-
return execute_qpack<bestla::gemm::ICoreRowNAvx512vnniKBlock<48, 4>, BTLA_ISA::AVX512_VNNI>(p, ctx, task);
216+
return parse_prob<bestla::gemm::ICoreRowNAvx512vnniKBlock<48, 4>, BTLA_ISA::AVX512_VNNI>(p, ctx, task);
198217
}
199218
if (dispatcher_utils::check_avx_vnni() && p->blocksize % bestla::gemm::ICoreRowNAvxvnniKBlock<24, 2>::KTILE == 0) {
200-
return execute_qpack<bestla::gemm::ICoreRowNAvxvnniKBlock<24, 2>, BTLA_ISA::AVX_VNNI>(p, ctx, task);
219+
return parse_prob<bestla::gemm::ICoreRowNAvxvnniKBlock<24, 2>, BTLA_ISA::AVX_VNNI>(p, ctx, task);
201220
}
202221
if (dispatcher_utils::check_avx2() && p->blocksize % bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>::KTILE == 0) {
203-
return execute_qpack<bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>, BTLA_ISA::AVX2>(p, ctx, task);
222+
return parse_prob<bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>, BTLA_ISA::AVX2>(p, ctx, task);
204223
}
205224
TORCH_CHECK(false, "Qbits: Illegal config in int8 compute_type, blocksize:", p->blocksize,
206225
", ISA support avx2:", dispatcher_utils::check_avx2());
207226
}
208227
if (p->compute_type == "fp32") {
209228
if (dispatcher_utils::check_avx512f()) {
210-
return execute_qpack<bestla::gemm::SCoreRowNAvx512f<48, 8>, BTLA_ISA::AVX512F>(p, ctx, task);
229+
return parse_prob<bestla::gemm::SCoreRowNAvx512f<48, 8>, BTLA_ISA::AVX512F>(p, ctx, task);
211230
}
212231
if (dispatcher_utils::check_avx2()) {
213-
return execute_qpack<bestla::gemm::SCoreRowNAvx2<24, 4>, BTLA_ISA::AVX2>(p, ctx, task);
232+
return parse_prob<bestla::gemm::SCoreRowNAvx2<24, 4>, BTLA_ISA::AVX2>(p, ctx, task);
214233
}
215234
TORCH_CHECK(false, "Qbits: device ISA must support BTLA_ISA::AVX2 when compute_type==fp32");
216235
}
217236
if (p->compute_type == "bf16") {
218237
if (dispatcher_utils::check_amx()) {
219-
return execute_qpack<bestla::gemm::HCoreRowNAmxbf16<64, 16>, BTLA_ISA::AMX_BF16>(p, ctx, task);
238+
return parse_prob<bestla::gemm::HCoreRowNAmxbf16<64, 16>, BTLA_ISA::AMX_BF16>(p, ctx, task);
220239
}
221240
TORCH_CHECK(false, "Qbits: device ISA must support AMX-BF16 when compute_type==bf16");
222241
}

intel_extension_for_transformers/qbits/dispatcher/src/bestla_weightonly_dispatcher.cpp

Lines changed: 5 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -43,12 +43,6 @@ concept quant_PrologueA = requires {
4343
requires !std::is_same_v<T, bestla::utils::bf16>;
4444
};
4545

46-
template <class GemmCore>
47-
constexpr bool is_int8_cmpt_gemmcore() {
48-
return GemmCore::ISA == BTLA_ISA::AMX_INT8 || GemmCore::ISA == BTLA_ISA::AVX512_VNNI ||
49-
GemmCore::ISA == BTLA_ISA::AVX_VNNI || std::is_same_v<GemmCore, bestla::gemm::ICoreRowNAvx2vnniKBlock<24, 2>>;
50-
}
51-
5246
template <class Launcher>
5347
void dequantize_packed_weight(woq_config_param* p, woq_runtime_ctx* ctx) {
5448
if (dispatcher_utils::initer.verbose) dispatcher_utils::timer.start();
@@ -133,7 +127,7 @@ void do_compute(woq_config_param* p, woq_runtime_ctx* ctx, ParamA param_a) {
133127
using StorageWeight = typename Launcher::PrologueB::StorageWeight;
134128
size_t asym_size = 0, shuf_size = 0;
135129
int8_t* tmpbuf = nullptr;
136-
if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
130+
if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
137131
using Parallel = bestla::parallel::gemm::SchedulerKBlockS<GemmCore>;
138132
bestla::utils::GemmProblem gp(1, ctx->m, ctx->n, ctx->k, p->blocksize);
139133
StorageWeight* packedw = dynamic_cast<StorageWeight*>(ctx->deseries_wei);
@@ -236,7 +230,7 @@ void execute_task(woq_config_param* p, woq_runtime_ctx* ctx) {
236230
template <WOQ_TASK TASK, class GemmCore, template <class _T, BTLA_ISA> class PrologueB,
237231
template <class _T, BTLA_ISA> class PrologueA, template <BTLA_ISA> class Epilogue>
238232
void parse_launcher(woq_config_param* p, woq_runtime_ctx* ctx) {
239-
if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
233+
if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
240234
using Launcher = bestla::wrapper::gemm::LauncherIntKBlock<GemmCore::ISA, GemmCore, PrologueA, PrologueB, Epilogue>;
241235
return execute_task<TASK, Launcher>(p, ctx);
242236
} else {
@@ -260,7 +254,7 @@ template <WOQ_TASK TASK, class GemmCore, template <class _T, BTLA_ISA> class Pro
260254
void parse_activation(woq_config_param* p, woq_runtime_ctx* ctx) {
261255
using namespace bestla::prologue_a::gemm;
262256
if (p->src_dt == dispatcher_utils::QBITS_FP32) {
263-
if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
257+
if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
264258
return parse_store<TASK, GemmCore, PrologueB, ShuffleActivationKBlockQuantizeF32, dispatcher_utils::QBITS_FP32>(
265259
p, ctx);
266260
} else {
@@ -269,7 +263,7 @@ void parse_activation(woq_config_param* p, woq_runtime_ctx* ctx) {
269263
}
270264
}
271265
if (p->src_dt == dispatcher_utils::QBITS_BF16) {
272-
if constexpr (is_int8_cmpt_gemmcore<GemmCore>()) {
266+
if constexpr (dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>()) {
273267
return parse_store<TASK, GemmCore, PrologueB, ShuffleActivationKBlockQuantizeBf16, dispatcher_utils::QBITS_BF16>(
274268
p, ctx);
275269
} else {
@@ -289,7 +283,7 @@ void parse_weight(woq_config_param* p, woq_runtime_ctx* ctx) {
289283
if (p->weight_type == "nf4" || p->weight_type == "fp4_e2m1_bnb" || p->weight_type == "fp4_e2m1" ||
290284
p->weight_type == "fp8_e4m3" || p->weight_type == "fp8_e5m2") {
291285
TORCH_CHECK(!p->asym, "Qbits: float-weight unsupports asym quantization.");
292-
if constexpr (!is_int8_cmpt_gemmcore<GemmCore>())
286+
if constexpr (!dispatcher_utils::is_int8_cmpt_gemmcore<GemmCore>())
293287
return parse_activation<TASK, GemmCore, WeightKBlockNFloat>(p, ctx);
294288
}
295289
TORCH_CHECK(false,

0 commit comments

Comments
 (0)