Spaces:
Running
Running
Neo Zhang Jianyu
committed on
Commit
·
8614863
1
Parent(s):
691c071
change the reorder tensor from init to execute OP (llama/13003)
Browse files
ggml/src/ggml-sycl/common.hpp
CHANGED
|
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
|
|
| 313 |
int device;
|
| 314 |
std::string name;
|
| 315 |
optimize_feature opt_feature;
|
| 316 |
-
bool optimized_graph=false;
|
| 317 |
|
| 318 |
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
|
| 319 |
|
|
|
|
| 313 |
int device;
|
| 314 |
std::string name;
|
| 315 |
optimize_feature opt_feature;
|
|
|
|
| 316 |
|
| 317 |
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
|
| 318 |
|
ggml/src/ggml-sycl/ggml-sycl.cpp
CHANGED
|
@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
|
|
| 192 |
|
| 193 |
if (!initialized) {
|
| 194 |
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
| 195 |
-
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT",
|
| 196 |
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
| 197 |
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
| 198 |
GGML_LOG_INFO("Running with Environment Variables:\n");
|
|
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
| 2852 |
}
|
| 2853 |
}
|
| 2854 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2855 |
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 2856 |
|
| 2857 |
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
|
@@ -2914,6 +2972,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
| 2914 |
// KQ + KQV multi-batch
|
| 2915 |
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
| 2916 |
} else if (use_dequantize_mul_mat_vec) {
|
|
|
|
| 2917 |
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
|
| 2918 |
// save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
|
| 2919 |
} else if (use_mul_mat_vec_q) {
|
|
@@ -2921,6 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
| 2921 |
} else if (use_mul_mat_q) {
|
| 2922 |
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
|
| 2923 |
} else {
|
|
|
|
| 2924 |
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
|
| 2925 |
}
|
| 2926 |
}
|
|
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
|
|
| 3545 |
std::exit(1);
|
| 3546 |
}
|
| 3547 |
|
| 3548 |
-
static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
| 3549 |
-
size_t size, size_t offset, dpct::queue_ptr stream) {
|
| 3550 |
-
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
| 3551 |
-
SYCL_CHECK(
|
| 3552 |
-
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
|
| 3553 |
-
.wait()));
|
| 3554 |
-
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
|
| 3555 |
-
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
|
| 3556 |
-
int offset_blks = offset / sizeof(block_q4_0);
|
| 3557 |
-
auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
|
| 3558 |
-
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
|
| 3559 |
-
|
| 3560 |
-
stream->parallel_for(
|
| 3561 |
-
size / sizeof(block_q4_0),
|
| 3562 |
-
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 3563 |
-
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
| 3564 |
-
const int ib = i;
|
| 3565 |
-
|
| 3566 |
-
for (int j = 0; j < QK4_0/2; j ++)
|
| 3567 |
-
{
|
| 3568 |
-
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
|
| 3569 |
-
}
|
| 3570 |
-
*(d_ptr + ib) = x[ib].d;
|
| 3571 |
-
});
|
| 3572 |
-
|
| 3573 |
-
sycl::free(tmp_buf, *stream);
|
| 3574 |
-
}
|
| 3575 |
-
|
| 3576 |
-
static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
| 3577 |
-
char*data_device = (char*)src0->data;
|
| 3578 |
-
size_t ncols = src0->ne[0];
|
| 3579 |
-
size_t nrows = src0->ne[1];
|
| 3580 |
-
size_t size = ggml_nbytes(src0);
|
| 3581 |
-
|
| 3582 |
-
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
| 3583 |
-
}
|
| 3584 |
-
|
| 3585 |
-
static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
| 3586 |
-
ggml_tensor *src0 = dst->src[0];
|
| 3587 |
-
ggml_tensor *src1 = dst->src[1];
|
| 3588 |
-
|
| 3589 |
-
if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
|
| 3590 |
-
src1->ne[2]==1 && src1->ne[3]==1) {
|
| 3591 |
-
reorder_qw(src0, stream);
|
| 3592 |
-
ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
|
| 3593 |
-
GGML_ASSERT(extra);
|
| 3594 |
-
extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
|
| 3595 |
-
}
|
| 3596 |
-
}
|
| 3597 |
-
|
| 3598 |
-
static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
|
| 3599 |
-
dpct::queue_ptr stream = ctx->stream();
|
| 3600 |
-
if (ctx->optimized_graph) {
|
| 3601 |
-
return;
|
| 3602 |
-
}
|
| 3603 |
-
ctx->optimized_graph = true;
|
| 3604 |
-
|
| 3605 |
-
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 3606 |
-
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
|
| 3607 |
-
}
|
| 3608 |
-
}
|
| 3609 |
-
|
| 3610 |
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
| 3611 |
ggml_sycl_set_main_device(sycl_ctx->device);
|
| 3612 |
-
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
|
| 3613 |
|
| 3614 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 3615 |
ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
| 192 |
|
| 193 |
if (!initialized) {
|
| 194 |
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
| 195 |
+
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
| 196 |
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
| 197 |
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
| 198 |
GGML_LOG_INFO("Running with Environment Variables:\n");
|
|
|
|
| 2852 |
}
|
| 2853 |
}
|
| 2854 |
|
| 2855 |
+
static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
| 2856 |
+
size_t size, size_t offset, dpct::queue_ptr stream) {
|
| 2857 |
+
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
| 2858 |
+
SYCL_CHECK(
|
| 2859 |
+
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
|
| 2860 |
+
.wait()));
|
| 2861 |
+
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
|
| 2862 |
+
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
|
| 2863 |
+
int offset_blks = offset / sizeof(block_q4_0);
|
| 2864 |
+
auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
|
| 2865 |
+
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
|
| 2866 |
+
|
| 2867 |
+
stream->parallel_for(
|
| 2868 |
+
size / sizeof(block_q4_0),
|
| 2869 |
+
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
| 2870 |
+
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
| 2871 |
+
const int ib = i;
|
| 2872 |
+
|
| 2873 |
+
for (int j = 0; j < QK4_0/2; j ++)
|
| 2874 |
+
{
|
| 2875 |
+
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
|
| 2876 |
+
}
|
| 2877 |
+
*(d_ptr + ib) = x[ib].d;
|
| 2878 |
+
});
|
| 2879 |
+
|
| 2880 |
+
sycl::free(tmp_buf, *stream);
|
| 2881 |
+
}
|
| 2882 |
+
|
| 2883 |
+
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
| 2884 |
+
char*data_device = (char*)src0->data;
|
| 2885 |
+
size_t ncols = src0->ne[0];
|
| 2886 |
+
size_t nrows = src0->ne[1];
|
| 2887 |
+
size_t size = ggml_nbytes(src0);
|
| 2888 |
+
|
| 2889 |
+
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
| 2890 |
+
}
|
| 2891 |
+
|
| 2892 |
+
/*
|
| 2893 |
+
* This function could be called when the OP (mul_mat) function support reorder optimizition.
|
| 2894 |
+
*/
|
| 2895 |
+
static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
|
| 2896 |
+
ggml_tensor * dst) {
|
| 2897 |
+
if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
|
| 2898 |
+
ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
|
| 2899 |
+
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
|
| 2900 |
+
src0->type == GGML_TYPE_Q4_0 &&
|
| 2901 |
+
src1->ne[2]==1 && src1->ne[3]==1) {
|
| 2902 |
+
|
| 2903 |
+
ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
|
| 2904 |
+
if (!extra) return; //only happen in CI/UT permute case.
|
| 2905 |
+
|
| 2906 |
+
if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder.
|
| 2907 |
+
|
| 2908 |
+
reorder_qw(src0, ctx->stream());
|
| 2909 |
+
extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
|
| 2910 |
+
}
|
| 2911 |
+
}
|
| 2912 |
+
|
| 2913 |
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 2914 |
|
| 2915 |
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
|
|
|
| 2972 |
// KQ + KQV multi-batch
|
| 2973 |
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
| 2974 |
} else if (use_dequantize_mul_mat_vec) {
|
| 2975 |
+
opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
|
| 2976 |
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
|
| 2977 |
// save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
|
| 2978 |
} else if (use_mul_mat_vec_q) {
|
|
|
|
| 2980 |
} else if (use_mul_mat_q) {
|
| 2981 |
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
|
| 2982 |
} else {
|
| 2983 |
+
opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
|
| 2984 |
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
|
| 2985 |
}
|
| 2986 |
}
|
|
|
|
| 3605 |
std::exit(1);
|
| 3606 |
}
|
| 3607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3608 |
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
| 3609 |
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
|
|
| 3610 |
|
| 3611 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 3612 |
ggml_tensor * node = cgraph->nodes[i];
|