Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 33 additions & 34 deletions ds4.c
Original file line number Diff line number Diff line change
Expand Up @@ -9617,15 +9617,19 @@ static bool metal_graph_encode_decode_layer(
if (ok) {
metal_graph_debug_dump_tensor("Qraw", g->q, q_dim, il, pos);
}
if (ok) ok = ds4_gpu_head_rms_norm_tensor(g->q, 1, DS4_N_HEAD, DS4_N_HEAD_DIM, DS4_RMS_EPS) != 0;
if (ok) {
metal_graph_debug_dump_tensor("Qnorm", g->q, q_dim, il, pos);
}
if (ok) ok = ds4_gpu_rope_tail_tensor(g->q, 1, DS4_N_HEAD, DS4_N_HEAD_DIM,
DS4_N_ROT, pos,
compressed ? (uint32_t)DS4_ROPE_ORIG_CTX : 0,
false, freq_base, freq_scale, ext_factor, attn_factor,
DS4_ROPE_YARN_BETA_FAST, DS4_ROPE_YARN_BETA_SLOW) != 0;
/* Fused head-rms-norm + RoPE rotation on Q (mainline already has the
* batched variant of this fusion implicit in some paths; the standalone
* fused kernel ds4_gpu_head_rms_norm_rope_tail_tensor saves one DRAM
* round trip and one kernel launch per layer on the decode hot path).
* Mathematically equivalent to the prior two-kernel sequence; FMA
* reordering may produce ULP-scale differences. */
if (ok) ok = ds4_gpu_head_rms_norm_rope_tail_tensor(g->q, 1, DS4_N_HEAD, DS4_N_HEAD_DIM,
DS4_N_ROT, pos,
compressed ? (uint32_t)DS4_ROPE_ORIG_CTX : 0,
false, freq_base, freq_scale,
ext_factor, attn_factor,
DS4_ROPE_YARN_BETA_FAST, DS4_ROPE_YARN_BETA_SLOW,
DS4_RMS_EPS) != 0;
DS4_METAL_PROFILE_DECODE_STAGE("q_path");
if (ok) {
metal_graph_debug_dump_tensor("Qcur", g->q, q_dim, il, pos);
Expand Down Expand Up @@ -11666,35 +11670,30 @@ static bool metal_graph_encode_layer_attention_batch(
(uint64_t)n_tokens * q_dim, il, pos0);
}
DS4_METAL_PROFILE_Q_STAGE("q_b");
if (ok) ok = ds4_gpu_head_rms_norm_tensor(g->batch_q,
n_tokens,
DS4_N_HEAD,
DS4_N_HEAD_DIM,
DS4_RMS_EPS) != 0;
if (ok) {
metal_graph_debug_dump_tensor("Qnorm", g->batch_q,
(uint64_t)n_tokens * q_dim, il, pos0);
}
DS4_METAL_PROFILE_Q_STAGE("head_norm");
if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_q,
n_tokens,
DS4_N_HEAD,
DS4_N_HEAD_DIM,
DS4_N_ROT,
pos0,
compressed ? (uint32_t)DS4_ROPE_ORIG_CTX : 0,
false,
freq_base,
freq_scale,
ext_factor,
attn_factor,
DS4_ROPE_YARN_BETA_FAST,
DS4_ROPE_YARN_BETA_SLOW) != 0;
/* Fused head-rms-norm + RoPE tail on Q (batched path). Replaces the
* head_rms_norm + rope_tail pair that ran sequentially; saves one DRAM
* round-trip and one launch per layer. ULP-scale FMA reordering may
* differ from the sequential pair. */
if (ok) ok = ds4_gpu_head_rms_norm_rope_tail_tensor(g->batch_q,
n_tokens,
DS4_N_HEAD,
DS4_N_HEAD_DIM,
DS4_N_ROT,
pos0,
compressed ? (uint32_t)DS4_ROPE_ORIG_CTX : 0,
false,
freq_base,
freq_scale,
ext_factor,
attn_factor,
DS4_ROPE_YARN_BETA_FAST,
DS4_ROPE_YARN_BETA_SLOW,
DS4_RMS_EPS) != 0;
if (ok) {
metal_graph_debug_dump_tensor("Qcur", g->batch_q,
(uint64_t)n_tokens * q_dim, il, pos0);
}
DS4_METAL_PROFILE_Q_STAGE("rope");
DS4_METAL_PROFILE_Q_STAGE("head_norm_rope");
DS4_METAL_PROFILE_ATTN_STAGE("q_path");
if (!qkv_rms_fused) {
if (ok) ok = metal_graph_matmul_q8_0_named_tensor("attn_kv",
Expand Down
113 changes: 111 additions & 2 deletions ds4_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2127,6 +2127,52 @@ __global__ static void matmul_q8_0_preq_batch_warp8_kernel(
if (lane == 0) out[tok * out_dim + row] = acc;
}

/* Shared-weight variant: each warp reads one row of weights once and
* computes N_TOK dot products against N_TOK different token inputs.
* Cuts weight-bandwidth N-fold vs the per-token kernel above. Used for
* small batches (MTP spec verify at N=2-4) where cuBLAS GEMM pads the
* tensor-core M tile (16 for f16) and wastes ~7/8 of the M-axis work. */
template <int N_TOK>
__global__ static void matmul_q8_0_preq_batch_share_warp_kernel(
float *out,
const unsigned char *w,
const int8_t *xq,
const float *xscale,
uint64_t in_dim,
uint64_t out_dim,
uint64_t blocks,
int use_dp4a) {
const uint64_t row = (uint64_t)blockIdx.x * 8u + (threadIdx.x >> 5u);
const uint32_t lane = threadIdx.x & 31u;
if (row >= out_dim) return;

const unsigned char *wr = w + row * blocks * 34;
float acc[N_TOK];
#pragma unroll
for (int t = 0; t < N_TOK; t++) acc[t] = 0.0f;

for (uint64_t b = lane; b < blocks; b += 32u) {
const uint64_t i0 = b * 32;
const uint64_t bn = in_dim - i0 < 32 ? in_dim - i0 : 32;
const __half *scale_h = (const __half *)(wr + b * 34);
const int8_t *qs = (const int8_t *)(wr + b * 34 + 2);
const float wscale = __half2float(*scale_h);
#pragma unroll
for (int t = 0; t < N_TOK; t++) {
const int8_t *xqb = xq + (uint64_t)t * blocks * 32 + b * 32;
const float xs = xscale[(uint64_t)t * blocks + b];
int dot = dot_i8_block(qs, xqb, bn, use_dp4a);
acc[t] += wscale * xs * (float)dot;
}
}
#pragma unroll
for (int t = 0; t < N_TOK; t++) acc[t] = warp_sum_f32(acc[t]);
if (lane == 0) {
#pragma unroll
for (int t = 0; t < N_TOK; t++) out[(uint64_t)t * out_dim + row] = acc[t];
}
}

__global__ static void dequant_q8_0_to_f16_kernel(
__half *out,
const unsigned char *w,
Expand Down Expand Up @@ -2368,8 +2414,15 @@ __global__ static void head_rms_norm_rope_tail_kernel(
float s = sinf(theta) * mscale;
if (inverse) s = -s;
float *tail = xr + n_nope;
float x0 = tail[i] * scale;
float x1 = tail[i + 1] * scale;
/* Match the sequential (rms-then-rope) numerical path: that path
* stores scale*tail[i] back to fp32 memory before the RoPE rotation
* reads it. Use __fmul_rn to force a single-rounded fp32 multiply
* for the scale step, preventing the compiler from fusing scale*x
* into the c/s multiply via FMA. Without this barrier the long-
* context (high pos0 -> large theta) drift compounds across layers
* and flips argmax decisions on long_memory_archive. */
float x0 = __fmul_rn(tail[i], scale);
float x1 = __fmul_rn(tail[i + 1], scale);
tail[i] = x0 * c - x1 * s;
tail[i + 1] = x0 * s + x1 * c;
}
Expand Down Expand Up @@ -5947,6 +6000,62 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
out->bytes < n_tok * out_dim * sizeof(float)) return 0;
const char *wptr = cuda_model_range_ptr(model_map, weight_offset, weight_bytes, "q8_0");
if (!wptr) return 0;
/* Small-batch shared-weight path: at n_tok = 2..4, the hand-rolled warp
* kernel that reads each weight row once and computes N dot products
* against N tokens replaces the per-token batch_warp8 kernel and is
* bit-identical to it (same blocks, same per-block FMA order, same warp
* reduction). Gate to the same conditions under which batch_warp8
* would have been chosen: no F32/F16 cuBLAS cache hit and blocks <= 32.
* Otherwise fall through so cuBLAS Gemm (the existing reference path)
* stays in charge for that weight. Disable with
* DS4_CUDA_NO_Q8_SHARE_BATCH=1. */
if (n_tok >= 2u && n_tok <= 4u && blocks <= 32u &&
getenv("DS4_CUDA_NO_Q8_SHARE_BATCH") == NULL &&
getenv("DS4_CUDA_NO_Q8_BATCH_WARP") == NULL &&
(!g_cublas_ready ||
(cuda_q8_f32_ptr(model_map, weight_offset, weight_bytes, in_dim, out_dim, label) == NULL &&
cuda_q8_f16_ptr(model_map, weight_offset, weight_bytes, in_dim, out_dim, label) == NULL))) {
const uint64_t share_xq_bytes = n_tok * blocks * 32u;
const uint64_t share_scale_offset = (share_xq_bytes + 15u) & ~15ull;
const uint64_t share_tmp_bytes = share_scale_offset + n_tok * blocks * sizeof(float);
void *share_tmp = cuda_tmp_alloc(share_tmp_bytes, "q8_0 share prequant");
if (share_tmp) {
int8_t *share_xq = (int8_t *)share_tmp;
float *share_xscale = (float *)((char *)share_tmp + share_scale_offset);
const int share_dp4a = cuda_q8_use_dp4a();
dim3 share_qgrid((unsigned)blocks, (unsigned)n_tok, 1);
quantize_q8_0_f32_kernel<<<share_qgrid, 32>>>(share_xq, share_xscale,
(const float *)x->ptr,
in_dim, blocks);
if (cuda_ok(cudaGetLastError(), "matmul_q8_0 share quantize launch")) {
const unsigned grid_x = ((unsigned)out_dim + 7u) / 8u;
bool launched = false;
if (n_tok == 2u) {
matmul_q8_0_preq_batch_share_warp_kernel<2><<<grid_x, 256>>>(
(float *)out->ptr,
reinterpret_cast<const unsigned char *>(wptr),
share_xq, share_xscale, in_dim, out_dim, blocks, share_dp4a);
launched = true;
} else if (n_tok == 3u) {
matmul_q8_0_preq_batch_share_warp_kernel<3><<<grid_x, 256>>>(
(float *)out->ptr,
reinterpret_cast<const unsigned char *>(wptr),
share_xq, share_xscale, in_dim, out_dim, blocks, share_dp4a);
launched = true;
} else if (n_tok == 4u) {
matmul_q8_0_preq_batch_share_warp_kernel<4><<<grid_x, 256>>>(
(float *)out->ptr,
reinterpret_cast<const unsigned char *>(wptr),
share_xq, share_xscale, in_dim, out_dim, blocks, share_dp4a);
launched = true;
}
if (launched && cuda_ok(cudaGetLastError(), "matmul_q8_0 share warp launch")) {
return 1;
}
}
}
/* Falls through to cuBLAS / fallback if anything above failed. */
}
if (g_cublas_ready && n_tok > 1) {
const float *w_f32 = cuda_q8_f32_ptr(model_map, weight_offset, weight_bytes, in_dim, out_dim, label);
if (w_f32) {
Expand Down
22 changes: 22 additions & 0 deletions ds4_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,28 @@ int ds4_gpu_rope_tail_tensor(
float beta_fast,
float beta_slow);

/* Fused per-head RMS norm + RoPE tail rotation on Q-style tensors.
* Mathematically equivalent to head_rms_norm_tensor + rope_tail_tensor
* applied back-to-back, but in a single kernel — saves one DRAM
* round-trip + one launch per call. ULP-scale FMA reordering may differ
* from the sequential pair. */
int ds4_gpu_head_rms_norm_rope_tail_tensor(
ds4_gpu_tensor *x,
uint32_t n_tok,
uint32_t n_head,
uint32_t head_dim,
uint32_t n_rot,
uint32_t pos0,
uint32_t n_ctx_orig,
bool inverse,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float eps);

/* Release decode fused KV finalizer: after the standalone RoPE kernel, this
* performs DS4's FP8 non-RoPE KV round trip and writes the F16-rounded raw
* attention cache row in one dispatch. */
Expand Down