Neo Zhang Jianyu committed
Commit 8614863 · 1 Parent(s): 691c071

change the reorder tensor from init to execute OP (llama/13003)

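In short, this commit removes the one-shot graph pass (optimize_graph_once and the ggml_backend_sycl_context::optimized_graph flag) and instead calls opt_for_reorder from ggml_sycl_mul_mat right before the OPs that support the reordered Q4_0 layout. It also flips the default of GGML_SYCL_DISABLE_OPT from 1 to 0, so the optimization is now enabled unless explicitly disabled. Below is a minimal, self-contained sketch of that environment-variable gating; the helper name reorder_opt_enabled and the direct use of std::getenv are illustrative assumptions, while the backend itself reads the variable through its get_sycl_env helper as shown in the diff.

// Hypothetical standalone check mirroring the new default (0 = optimization enabled).
// Setting GGML_SYCL_DISABLE_OPT=1 in the environment turns the reorder optimization off.
#include <cstdio>
#include <cstdlib>

static bool reorder_opt_enabled() {
    const char * val     = std::getenv("GGML_SYCL_DISABLE_OPT");
    const int    disable = (val != nullptr) ? std::atoi(val) : 0;  // unset -> 0 -> enabled
    return disable == 0;
}

int main() {
    std::printf("SYCL Q4_0 reorder optimization: %s\n",
                reorder_opt_enabled() ? "enabled" : "disabled");
    return 0;
}

With the change in ggml_sycl_mul_mat, the reorder then happens lazily on the first qualifying mul_mat execution per tensor (tracked through extra->optimized_feature.reorder) rather than once per graph at initialization.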
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+                       size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j ++)
+            {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char*data_device = (char*)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function could be called when the OP (mul_mat) function support reorder optimizition.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                            ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
+        dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+
+        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
+        if (!extra) return; //only happen in CI/UT permute case.
+
+        if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder.
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
@@ -2914,6 +2972,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
@@ -2921,6 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j ++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-        return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];