forked from ggml-org/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrdna2-patch.diff
More file actions
89 lines (83 loc) · 4.46 KB
/
rdna2-patch.diff
File metadata and controls
89 lines (83 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index beeb52389..e37819e34 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -1051,7 +1051,31 @@ void launch_fattn(
const dim3 block_dim(warp_size, nwarps, 1);
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
- GGML_ASSERT(max_blocks_per_sm > 0);
+#ifdef GGML_FATTN_TRACE
+    {
+        // Opt-in diagnostics: dump the launch configuration, limited to the first 16 calls.
+        static int fattn_trace_count = 0;
+        if (fattn_trace_count++ < 16) {
+            GGML_LOG_INFO("[fattn-trace] %s\n", __PRETTY_FUNCTION__);
+            GGML_LOG_INFO("[fattn-trace] device=%d cc=%d nsm=%d warp_size=%d nwarps=%d threads/block=%d\n",
+                id, cc, nsm, warp_size, nwarps, (int)(block_dim.x * block_dim.y * block_dim.z));
+            GGML_LOG_INFO("[fattn-trace] nbytes_shared=%zu nbatch_fa=%d stream_k=%d need_f16_K=%d need_f16_V=%d\n",
+                nbytes_shared, nbatch_fa, (int)stream_k, (int)need_f16_K, (int)need_f16_V);
+            GGML_LOG_INFO("[fattn-trace] cudaOccupancyMaxActiveBlocksPerMultiprocessor -> max_blocks_per_sm=%d\n",
+                max_blocks_per_sm);
+        }
+    }
+#endif
+    if (max_blocks_per_sm <= 0) {
+        // A non-positive occupancy result no longer aborts; assume one active block per SM instead.
+        // Warn only once: this path runs on every launch and would otherwise flood the log.
+        static bool occupancy_warned = false;
+        if (!occupancy_warned) {
+            occupancy_warned = true;
+            GGML_LOG_WARN("cudaOccupancyMaxActiveBlocksPerMultiprocessor returned %d, falling back to 1\n", max_blocks_per_sm);
+        }
+        max_blocks_per_sm = 1;
+    }
int parallel_blocks = max_blocks_per_sm;
const int ntiles_KV = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by KV cache length.
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index a25e912c4..1686e0655 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1950,6 +1950,12 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
#endif // !defined(GGML_USE_MUSA)
}
+#if defined(GGML_FATTN_TRACE)
+    // Trace builds only: report that the mma path is about to launch, with its configuration.
+    GGML_LOG_INFO(
+        "[fattn-path] mma (DKQ=%d, DV=%d, ncols1=%d, ncols2=%d, nwarps=%d, nbatch_fa=%d)\n",
+        DKQ, DV, ncols1, ncols2, nwarps, nbatch_fa);
+#endif // defined(GGML_FATTN_TRACE)
launch_fattn<DV, ncols1, ncols2>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, nbatch_fa, true, true, true, warp_size_host);
}
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
index 7b0a5e5cf..be6aeae28 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -1145,6 +1145,13 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
const int cc = ggml_cuda_info().devices[id].cc;
const int warp_size = 32;
+#if defined(GGML_FATTN_TRACE)
+    // Trace builds only: report that the tile path was selected, with its configuration.
+    GGML_LOG_INFO(
+        "[fattn-path] tile (DKQ=%d, DV=%d, ncols2=%d, Q->ne[1]=%lld)\n",
+        DKQ, DV, ncols2, static_cast<long long>(Q->ne[1]));
+#endif // defined(GGML_FATTN_TRACE)
+
constexpr size_t nbytes_shared = 0;
#ifdef GGML_USE_HIP
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
index f0bd42a57..522901857 100644
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -529,6 +529,11 @@ void ggml_cuda_flash_attn_ext_vec_case_impl(ggml_backend_cuda_context & ctx, ggm
const bool need_f16_K = type_K == GGML_TYPE_F16;
const bool need_f16_V = type_V == GGML_TYPE_F16;
constexpr size_t nbytes_shared = 0;
+#if defined(GGML_FATTN_TRACE)
+    // Trace builds only: report that the vec path is about to launch, with its configuration.
+    GGML_LOG_INFO("[fattn-path] vec (D=%d, cols_per_block=%d, nwarps=%d)\n",
+        D, cols_per_block, nwarps);
+#endif // defined(GGML_FATTN_TRACE)
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index f19defbff..20dfde7c7 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -552,6 +552,11 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
}
+#if defined(GGML_FATTN_TRACE)
+    // Trace builds only: report that the wmma path is about to launch, with its configuration.
+    GGML_LOG_INFO("[fattn-path] wmma (D=%d, cols_per_block=%d, nwarps=%d)\n",
+        D, cols_per_block, nwarps);
+#endif // defined(GGML_FATTN_TRACE)
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
}