From eed42a9dfa2d81358344689f04489517ee8c0510 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Thu, 19 Mar 2026 23:28:36 -0400 Subject: [PATCH 1/7] Add host-side Sparge block-map pipeline for sparse attention examples - Add sparge_tool.hpp: host-side Sparge block-map builder (mean-sim scoring, CDF/topk selection) and VSA delta-LUT converter. - Add test_sparge_jenga_sparse_attn.cpp and test_sparge_vsa_sparse_attn.cpp as end-to-end demos. - Update CMakeLists.txt to register both new executables. Note: block size is currently fixed at 128; flexible block size support is not yet addressed. --- example/ck_tile/50_sparse_attn/CMakeLists.txt | 22 + .../ck_tile/50_sparse_attn/sparge_tool.hpp | 408 +++++++++++++++++ .../test_sparge_jenga_sparse_attn.cpp | 422 +++++++++++++++++ .../test_sparge_vsa_sparse_attn.cpp | 429 ++++++++++++++++++ 4 files changed, 1281 insertions(+) create mode 100644 example/ck_tile/50_sparse_attn/sparge_tool.hpp create mode 100644 example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp create mode 100644 example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp diff --git a/example/ck_tile/50_sparse_attn/CMakeLists.txt b/example/ck_tile/50_sparse_attn/CMakeLists.txt index 65bb2077642..c916f642ebb 100644 --- a/example/ck_tile/50_sparse_attn/CMakeLists.txt +++ b/example/ck_tile/50_sparse_attn/CMakeLists.txt @@ -88,6 +88,17 @@ target_compile_options(${EXAMPLE_JENGA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) +# Sparge + Jenga Example executable +set(EXAMPLE_SPARGE_JENGA_SPARSE_ATTN "tile_example_sparge_jenga_sparse_attn") +message(DEBUG "adding example ${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN}") +add_executable(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_jenga_sparse_attn.cpp) +target_link_libraries(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} ${SPARSE_ATTN_JENGA_INSTANCES}) +target_include_directories(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_compile_options(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal +) + # ============================================================================ # VSA Sparse Attention # ============================================================================ @@ -153,4 +164,15 @@ target_compile_options(${EXAMPLE_VSA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) +# Sparge + VSA Example executable +set(EXAMPLE_SPARGE_VSA_SPARSE_ATTN "tile_example_sparge_vsa_sparse_attn") +message(DEBUG "adding example ${EXAMPLE_SPARGE_VSA_SPARSE_ATTN}") +add_executable(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_vsa_sparse_attn.cpp) +target_link_libraries(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} ${SPARSE_ATTN_VSA_INSTANCES}) +target_include_directories(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_compile_options(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal +) + set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/50_sparse_attn/sparge_tool.hpp b/example/ck_tile/50_sparse_attn/sparge_tool.hpp new file mode 100644 index 00000000000..49c69cc6f74 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/sparge_tool.hpp @@ -0,0 +1,408 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +namespace sparge { + +struct SpargeParams +{ + int BLKQ = 128; + int BLKK = 128; + + // Similarity gate threshold (TODO: per-head support). 
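    // A block counts as "similar" when the mean pairwise cosine similarity of its
    // tokens exceeds this threshold; raising it makes more blocks fail the gate,
    // which forces more of the block map dense.
    //
    // Minimal usage sketch (illustrative values only, not this example's defaults):
    //
    //     sparge::SpargeParams p;
    //     p.simthreshd1 = 0.5f;
    //     p.cdfthreshd  = 0.95f;  // keep K-blocks until 95% of a row's pooled softmax mass is covered
    //     p.topk        = -1.0f;  // <= 0 selects by cdfthreshd; > 0 overrides it
    //     auto block_map = sparge::build_block_map_meansim(q_host, k_host, p);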
+ float simthreshd1 = 0.6f; + + // Exactly one of the following should be used: + // - Use CDF threshold if topk < 0 + // - Both should be in [0, 1] <-- NEED TO CHECK THIS + float cdfthreshd = 0.98f; + float topk = -1.0f; + + // If true, treat Q/K as BHSD; otherwise BSHD (same convention as CK examples). + bool i_perm = true; +}; + +// Output format CK VSA expects. +struct VSALut +{ + ck_tile::HostTensor lut; // [B, Hq, Q_blk, K_blk] delta-encoded + ck_tile::HostTensor valid_block_num; // [B, Hq, Q_blk] +}; + +namespace detail { + +template +inline float to_f32(const T& x) +{ + return ck_tile::type_convert(x); +} + +// Read element from HostTensor with either BHSD or BSHD layout. +// Q: [B, Hq, Sq, D] if i_perm else [B, Sq, Hq, D] +// K: [B, Hk, Sk, D] if i_perm else [B, Sk, Hk, D] +template +inline float load(const ck_tile::HostTensor& X, bool i_perm, int b, int h, int s, int d) +{ + return i_perm ? to_f32(X(b, h, s, d)) : to_f32(X(b, s, h, d)); +} + +// Compute pooled mean vector of one block: mean over tokens in [s0, s1). +template +std::vector +pooled_mean_block(const ck_tile::HostTensor& X, bool i_perm, int b, int h, int s0, int s1, int d) +{ + std::vector mean(d, 0.0f); + const int bs = std::max(0, s1 - s0); + if(bs == 0) + return mean; + + for(int s = s0; s < s1; ++s) + { + for(int d_ = 0; d_ < d; ++d_) + { + mean[d_] += load(X, i_perm, b, h, s, d_); + } + } + const float inv = 1.0f / static_cast(bs); + for(int d_ = 0; d_ < d; ++d_) + mean[d_] *= inv; + return mean; +} + +// Compute "sim" flag of one block following SpargeAttn's intent: +// mean_sim = sum(Gram(x_hat)) / (BS_*BS_), where x_hat are token vectors normalized along D. +// +// Important: sum(Gram) = ||sum_i x_hat_i||^2, so we can compute it in O(BS_*D) exactly +// instead of O(BS_^2 * D). +template +bool sim_block_flag(const ck_tile::HostTensor& X, + bool i_perm, + int b, + int h, + int s0, + int s1, + int d, + float simthreshd1) +{ + const int bs = std::max(0, s1 - s0); + if(bs == 0) + return false; + + std::vector sum_hat(d, 0.0f); + + for(int s = s0; s < s1; ++s) + { + // Compute L2 norm over D. + float norm2 = 0.0f; + for(int d_ = 0; d_ < d; ++d_) + { + const float v = load(X, i_perm, b, h, s, d_); + norm2 += v * v; + } + float inv_norm = 1.0f; + // spargeAttn use eps to prevent division by zero + if(norm2 > 0.0f) + inv_norm = 1.0f / std::sqrt(norm2); + + // Accumulate normalized vector. + for(int d_ = 0; d_ < d; ++d_) + { + sum_hat[d_] += load(X, i_perm, b, h, s, d_) * inv_norm; + } + } + + float sum_gram = 0.0f; + for(int d_ = 0; d_ < d; ++d_) + sum_gram += sum_hat[d_] * sum_hat[d_]; + + const float denom = static_cast(bs) * static_cast(bs); + const float mean_sim = sum_gram / denom; + + return mean_sim > simthreshd1; +} + +inline int select_count_from_cdf(const std::vector& sorted_probs, float cdfthreshd) +{ + // Choose the smallest n such that cdf[n-1] >= cdfthreshd. + // Ensure at least 1. + if(sorted_probs.empty()) + return 0; + if(cdfthreshd <= 0.0f) + return 1; + + float c = 0.0f; + for(int i = 0; i < static_cast(sorted_probs.size()); ++i) + { + c += sorted_probs[i]; + if(c >= cdfthreshd) + return i + 1; + } + return static_cast(sorted_probs.size()); +} + +inline int select_count_from_topk(int K_blk, float topk) +{ + if(K_blk <= 0) + return 0; + int n = static_cast(std::floor(topk * static_cast(K_blk))); + n = std::max(1, n); + return n; +} + +} // namespace detail + +// Build one-hot block_map[b,hq,qb,kb] in {0,1}. 
+// - No causal mask +// - No attention sink +// - Logic matches SpargeAttn's structure: +// - score softmax is only over sim_kblocks; ~sim_kblocks are forced ON later +// - if a Q-block is not "similar", force the whole row ON +template +ck_tile::HostTensor build_block_map_meansim(const ck_tile::HostTensor& Q, + const ck_tile::HostTensor& K, + const SpargeParams& p) +{ + const auto qlens = Q.get_lengths(); + const auto klens = K.get_lengths(); + + const int B = static_cast(qlens[0]); + const int Hq = p.i_perm ? static_cast(qlens[1]) : static_cast(qlens[2]); + const int Sq = p.i_perm ? static_cast(qlens[2]) : static_cast(qlens[1]); + const int D = static_cast(qlens[3]); + + [[maybe_unused]] const int Bk = static_cast(klens[0]); + const int Hk = p.i_perm ? static_cast(klens[1]) : static_cast(klens[2]); + const int Sk = p.i_perm ? static_cast(klens[2]) : static_cast(klens[1]); + [[maybe_unused]] const int Dk = static_cast(klens[3]); + + assert(B == Bk && D == Dk && Hq % Hk == 0); + assert(p.BLKQ > 0 && p.BLKK > 0); + + const int nhead_ratio_qk = Hq / Hk; + const int Q_blk = ck_tile::integer_divide_ceil(Sq, p.BLKQ); + const int K_blk = ck_tile::integer_divide_ceil(Sk, p.BLKK); + + ck_tile::HostTensor block_map({B, Hq, Q_blk, K_blk}); + + // pooled_q: [B,Hq,Q_blk,D], pooled_k: [B,Hk,K_blk,D] + // sim_q: [B,Hq,Q_blk], sim_k: [B,Hk,K_blk] + std::vector pooled_q(static_cast(B) * Hq * Q_blk * D, 0.0f); + std::vector pooled_k(static_cast(B) * Hk * K_blk * D, 0.0f); + std::vector sim_q(static_cast(B) * Hq * Q_blk, 0); + std::vector sim_k(static_cast(B) * Hk * K_blk, 0); + + auto idx_pq = [&](int b, int hq, int qb, int d) { + return (((b * Hq + hq) * Q_blk + qb) * D + d); + }; + auto idx_pk = [&](int b, int hk, int kb, int d) { + return (((b * Hk + hk) * K_blk + kb) * D + d); + }; + auto idx_sq = [&](int b, int hq, int qb) { return ((b * Hq + hq) * Q_blk + qb); }; + auto idx_sk = [&](int b, int hk, int kb) { return ((b * Hk + hk) * K_blk + kb); }; + + for(int b = 0; b < B; ++b) + { + for(int hq = 0; hq < Hq; ++hq) + { + // Q blocks + for(int qb = 0; qb < Q_blk; ++qb) + { + const int s0 = qb * p.BLKQ; + const int s1 = std::min(Sq, (qb + 1) * p.BLKQ); + + // pooled mean + auto mean = detail::pooled_mean_block(Q, p.i_perm, b, hq, s0, s1, D); + for(int d = 0; d < D; ++d) + pooled_q[idx_pq(b, hq, qb, d)] = mean[d]; + + // sim flag + sim_q[idx_sq(b, hq, qb)] = + detail::sim_block_flag(Q, p.i_perm, b, hq, s0, s1, D, p.simthreshd1) ? 1 : 0; + } + } + + for(int hk = 0; hk < Hk; ++hk) + { + // K blocks + for(int kb = 0; kb < K_blk; ++kb) + { + const int s0 = kb * p.BLKK; + const int s1 = std::min(Sk, (kb + 1) * p.BLKK); + + auto mean = detail::pooled_mean_block(K, p.i_perm, b, hk, s0, s1, D); + for(int d = 0; d < D; ++d) + pooled_k[idx_pk(b, hk, kb, d)] = mean[d]; + + sim_k[idx_sk(b, hk, kb)] = + detail::sim_block_flag(K, p.i_perm, b, hk, s0, s1, D, p.simthreshd1) ? 1 : 0; + } + } + } + + const float scale = 1.0f / std::sqrt(static_cast(D)); + + // Main loop + for(int b = 0; b < B; ++b) + { + for(int hq = 0; hq < Hq; ++hq) + { + const int hk = hq / nhead_ratio_qk; + + for(int qb = 0; qb < Q_blk; ++qb) + { + const bool q_is_sim = (sim_q[idx_sq(b, hq, qb)] != 0); + + // If Q-block is not "similar", force dense row. + if(!q_is_sim) + { + for(int kb = 0; kb < K_blk; ++kb) + block_map(b, hq, qb, kb) = 1; + continue; + } + + // Compute scores over K blocks (only sim_kblocks participate in softmax; others set + // to -inf). 
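                // Worked example with made-up numbers: suppose K_blk = 4 and block 2
                // fails the similarity gate. Block 2 is forced ON below and keeps a
                // probability of 0. If the remaining probabilities sort to
                // {0.6, 0.3, 0.1} and cdfthreshd = 0.85, the CDF rule keeps two of
                // them (0.6 < 0.85, then 0.6 + 0.3 = 0.9 >= 0.85), whereas
                // topk = 0.5 would keep floor(0.5 * 4) = 2 of the sorted blocks.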
+ std::vector score(K_blk, -std::numeric_limits::infinity()); + for(int kb = 0; kb < K_blk; ++kb) + { + const bool k_is_sim = (sim_k[idx_sk(b, hk, kb)] != 0); + if(!k_is_sim) + { + block_map(b, hq, qb, kb) = 1; + continue; + } + + float dot = 0.0f; + for(int d = 0; d < D; ++d) + { + dot += pooled_q[idx_pq(b, hq, qb, d)] * pooled_k[idx_pk(b, hk, kb, d)]; + } + score[kb] = dot * scale; + } + + // Softmax over K_blk (numerically stable). If all -inf, probs become all zeros. + float maxv = -std::numeric_limits::infinity(); + for(int kb = 0; kb < K_blk; ++kb) + maxv = std::max(maxv, score[kb]); + + std::vector prob(K_blk, 0.0f); + if(std::isfinite(maxv)) + { + float sumexp = 0.0f; + for(int kb = 0; kb < K_blk; ++kb) + { + if(!std::isfinite(score[kb])) + continue; + const float e = std::exp(score[kb] - maxv); + prob[kb] = e; + sumexp += e; + } + if(sumexp > 0.0f) + { + const float inv = 1.0f / sumexp; + for(int kb = 0; kb < K_blk; ++kb) + prob[kb] *= inv; + } + else + { + // All exponentials underflowed: keep zeros. + std::fill(prob.begin(), prob.end(), 0.0f); + } + } + + // Sort indices by prob descending. + std::vector order(K_blk); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](int a, int c) { + if(prob[a] != prob[c]) + return prob[a] > prob[c]; + return a < c; // tie-breaker for determinism + }); + + // Determine how many to select. + int num_to_select = 0; + if(p.topk > 0.0f) + { + num_to_select = detail::select_count_from_topk(K_blk, p.topk); + } + else + { + // Use CDF threshold selection (smallest n s.t. cumulative prob >= cdfthreshd). + std::vector sorted_probs(K_blk); + for(int i = 0; i < K_blk; ++i) + sorted_probs[i] = prob[order[i]]; + num_to_select = detail::select_count_from_cdf(sorted_probs, p.cdfthreshd); + num_to_select = std::max(1, num_to_select); + } + + // Select top-kb blocks by order[0..num_to_select-1]. + for(int i = 0; i < num_to_select; ++i) + { + const int kb = order[i]; + block_map(b, hq, qb, kb) = 1; + } + } + } + } + + return block_map; +} + +// Convert one-hot block_map -> delta-encoded LUT + valid_block_num (CK VSA format). +template +VSALut block_map_to_vsa_lut_delta(const ck_tile::HostTensor& block_map) +{ + const auto lens = block_map.get_lengths(); + const int B = static_cast(lens[0]); + const int H = static_cast(lens[1]); + const int Q = static_cast(lens[2]); + const int K = static_cast(lens[3]); + + VSALut out{ + ck_tile::HostTensor({B, H, Q, K}), + ck_tile::HostTensor({B, H, Q}), + }; + + for(int b = 0; b < B; ++b) + { + for(int h = 0; h < H; ++h) + { + for(int q = 0; q < Q; ++q) + { + int32_t valid = 0; + int32_t prev = 0; + + for(int k = 0; k < K; ++k) + { + const bool on = static_cast(block_map(b, h, q, k)) != 0; + if(on) + { + out.lut(b, h, q, valid) = static_cast(k - prev); + prev = static_cast(k); + ++valid; + } + } + + out.valid_block_num(b, h, q) = valid; + + // Optional: zero-fill the unused tail for determinism. + for(int i = valid; i < K; ++i) + out.lut(b, h, q, i) = 0; + } + } + } + + return out; +} + +} // namespace sparge diff --git a/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp new file mode 100644 index 00000000000..0bd664adf68 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp @@ -0,0 +1,422 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +// Demo: Sparge block-map -> Jenga sparse attention + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/reference/reference_blocked_attention.hpp" +#include "ck_tile/core/utility/bit_cast.hpp" + +#include "jenga_sparse_attention.h" +#include "sparge_tool.hpp" + +// ============================================================================ +// Helper Functions +// ============================================================================ + +template +ck_tile::HostTensor make_qkv_tensor(ck_tile::index_t batch, + ck_tile::index_t nhead, + ck_tile::index_t seqlen, + ck_tile::index_t hdim, + bool i_perm) +{ + if(i_perm) + { + return ck_tile::HostTensor({batch, nhead, seqlen, hdim}); + } + return ck_tile::HostTensor({batch, seqlen, nhead, hdim}); +} + +template +ck_tile::HostTensor to_bhsd(const ck_tile::HostTensor& tensor, bool is_bhsd) +{ + auto lens = tensor.get_lengths(); + ck_tile::index_t batch = lens[0]; + ck_tile::index_t seqlen = is_bhsd ? lens[2] : lens[1]; + ck_tile::index_t nhead = is_bhsd ? lens[1] : lens[2]; + ck_tile::index_t hdim = lens[3]; + + ck_tile::HostTensor out({batch, nhead, seqlen, hdim}); + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t s = 0; s < seqlen; ++s) + { + for(ck_tile::index_t d = 0; d < hdim; ++d) + { + out(b, h, s, d) = is_bhsd ? tensor(b, h, s, d) : tensor(b, s, h, d); + } + } + } + } + return out; +} + +template +auto get_error_tolerance() +{ + double rtol = 1e-2; + double atol = 4e-2; + if constexpr(std::is_same_v) + { + atol = 2e-1; + rtol = 2e-1; + } + return ck_tile::make_tuple(rtol, atol); +} + +template +float to_float_for_compare(T value) +{ + return static_cast(value); +} + +template <> +float to_float_for_compare(ck_tile::bf16_t value) +{ +#if CK_TILE_USE_CUSTOM_DATA_TYPE + return static_cast(value); +#else + return ck_tile::bf16_to_float_raw(ck_tile::bit_cast(value)); +#endif +} + +// ============================================================================ +// Command line argument parser +// ============================================================================ + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("v", "1", "0:no validation, 1:cpu validation") + .insert("b", "1", "batch size") + .insert("h", "4", "num of head for q") + .insert("h_k", "-1", "num of head for k/v, -1 means equal to h") + .insert("s", "4096", "seqlen_q") + .insert("s_k", "-1", "seqlen_k, -1 means equal to s") + .insert("d", "128", "head dim for q, k") + .insert("d_v", "-1", "head dim for v, -1 means equal to d") + .insert("prec", "fp16", "data type: fp16/bf16") + .insert("iperm", "1", "permute input, 1: b*h*s*d, 0: b*s*h*d") + .insert("operm", "1", "permute output") + .insert("seed", "42", "random seed") + .insert("warmup", "5", "warmup iterations") + .insert("repeat", "20", "benchmark iterations") + .insert("kname", "0", "print kernel name") + // Sparge-specific + .insert("blkq", "128", "Sparge BLKQ") + .insert("blkk", "128", "Sparge BLKK") + .insert("simthreshd1", "0.6", "Sparge sim threshold") + .insert("cdfthreshd", "0.98", "Sparge CDF threshold (used when topk < 0)") + .insert("topk", "-1.0", "Sparge topk ratio in (0,1]; if > 0, overrides cdfthreshd"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// 
============================================================================ +// Main Test Function +// ============================================================================ + +template +bool run_test(const ck_tile::ArgParser& arg_parser) +{ + int do_validation = arg_parser.get_int("v"); + ck_tile::index_t batch = arg_parser.get_int("b"); + ck_tile::index_t nhead = arg_parser.get_int("h"); + ck_tile::index_t nhead_k = arg_parser.get_int("h_k"); + ck_tile::index_t seqlen_q = arg_parser.get_int("s"); + ck_tile::index_t seqlen_k = arg_parser.get_int("s_k"); + ck_tile::index_t hdim_q = arg_parser.get_int("d"); + ck_tile::index_t hdim_v = arg_parser.get_int("d_v"); + bool i_perm = arg_parser.get_bool("iperm"); + bool o_perm = arg_parser.get_bool("operm"); + uint32_t seed = arg_parser.get_uint32("seed"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int kname = arg_parser.get_int("kname"); + + // Sparge params + ck_tile::index_t blkq = arg_parser.get_int("blkq"); + ck_tile::index_t blkk = arg_parser.get_int("blkk"); + float simthreshd1 = arg_parser.get_float("simthreshd1"); + float cdfthreshd = arg_parser.get_float("cdfthreshd"); + float topk = arg_parser.get_float("topk"); + + if(nhead_k < 0) + nhead_k = nhead; + if(seqlen_k < 0) + seqlen_k = seqlen_q; + if(hdim_v < 0) + hdim_v = hdim_q; + + if(blkq != 128 || blkk != 128 || hdim_q != 128 || hdim_v != 128) + { + std::cout << "\n>>> TEST SKIPPED <<<" << std::endl; + std::cout << "Jenga/VSA kernel instances are generated for BLKQ=BLKK=128, " + "hdim_q=128, hdim_v=128 only." + << std::endl; + std::cout << "TEST SKIPPED" << std::endl; + return true; + } + + ck_tile::index_t BLKQ = blkq; + ck_tile::index_t BLKK = blkk; + + ck_tile::index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ; + ck_tile::index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK; + + std::cout << "============================================================" << std::endl; + std::cout << "[Sparge -> Jenga Sparse Attention Demo]" << std::endl; + std::cout << "============================================================" << std::endl; + std::cout << " Batch: " << batch << ", nhead_q: " << nhead << ", nhead_k: " << nhead_k + << std::endl; + std::cout << " seqlen_q: " << seqlen_q << ", seqlen_k: " << seqlen_k << std::endl; + std::cout << " hdim_q: " << hdim_q << ", hdim_v: " << hdim_v << std::endl; + std::cout << " BLKQ=" << BLKQ << ", BLKK=" << BLKK << std::endl; + std::cout << " num_q_blocks: " << num_q_blocks << ", num_k_blocks: " << num_k_blocks + << std::endl; + std::cout << " Sparge(simthreshd1=" << simthreshd1 << ", cdfthreshd=" << cdfthreshd + << ", topk=" << topk << ")" << std::endl; + std::cout << " i_perm: " << i_perm << ", o_perm: " << o_perm << std::endl; + + // Create host tensors + ck_tile::HostTensor q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); + ck_tile::HostTensor k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); + ck_tile::HostTensor v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); + ck_tile::HostTensor output_host = + o_perm ? ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) + : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); + ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); + + std::cout << "\nInitializing tensors..." 
<< std::endl; + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); + + // Build block map using Sparge tool + std::cout << "Building Sparge block map..." << std::endl; + sparge::SpargeParams p; + p.BLKQ = static_cast(BLKQ); + p.BLKK = static_cast(BLKK); + p.simthreshd1 = simthreshd1; + p.cdfthreshd = cdfthreshd; + p.topk = topk; + p.i_perm = i_perm; + + ck_tile::HostTensor block_relation_onehot = + sparge::build_block_map_meansim(q_host, k_host, p); + + // Print actual sparsity + std::size_t total_blocks = 0; + std::size_t active_blocks = 0; + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) + { + for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) + { + total_blocks++; + if(block_relation_onehot(b, h, qb, kb) != 0) + active_blocks++; + } + } + } + } + float actual_sparsity = + 1.0f - static_cast(active_blocks) / static_cast(total_blocks); + std::cout << " Actual sparsity: " << actual_sparsity << " (" << active_blocks << "/" + << total_blocks << " blocks active)" << std::endl; + + std::cout << "\n--- Running Jenga sparse attention kernel ---" << std::endl; + + try + { + if(kname) + { + jenga_sparse_attention(q_host, + k_host, + v_host, + block_relation_onehot, + output_host, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + i_perm, + o_perm, + seqlen_q, + seqlen_k, + 1); + } + + for(int i = 0; i < warmup; ++i) + { + jenga_sparse_attention(q_host, + k_host, + v_host, + block_relation_onehot, + output_host, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + i_perm, + o_perm, + seqlen_q, + seqlen_k, + 0); + } + + [[maybe_unused]] auto sync_status1 = hipDeviceSynchronize(); + auto start = std::chrono::high_resolution_clock::now(); + + for(int i = 0; i < repeat; ++i) + { + jenga_sparse_attention(q_host, + k_host, + v_host, + block_relation_onehot, + output_host, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + i_perm, + o_perm, + seqlen_q, + seqlen_k, + 0); + } + + [[maybe_unused]] auto sync_status2 = hipDeviceSynchronize(); + auto end = std::chrono::high_resolution_clock::now(); + double avg_time_ms = + std::chrono::duration(end - start).count() / repeat; + + std::cout << "\n>>>> Jenga sparse attention average time: " << avg_time_ms << " ms <<<<" + << std::endl; + } + catch(const std::exception& e) + { + std::cerr << "Error during kernel execution: " << e.what() << std::endl; + return false; + } + + bool pass = true; + if(do_validation) + { + std::cout << "\n--- Performing CPU validation ---" << std::endl; + float scale = 1.0f / std::sqrt(static_cast(hdim_q)); + + std::cout << "Computing reference output..." 
<< std::endl; + auto q_ref = to_bhsd(q_host, i_perm); + auto k_ref = to_bhsd(k_host, i_perm); + auto v_ref = to_bhsd(v_host, i_perm); + + ck_tile::reference_blocked_attention( + q_ref, k_ref, v_ref, block_relation_onehot, output_ref, BLKQ, BLKK, scale); + + auto [rtol, atol] = get_error_tolerance(); + + float max_diff = 0.0f; + float max_rel_diff = 0.0f; + std::size_t num_errors = 0; + + auto output_host_bhsd = to_bhsd(output_host, o_perm); + for(std::size_t i = 0; i < output_host_bhsd.mData.size(); ++i) + { + float gpu_val = to_float_for_compare(output_host_bhsd.mData[i]); + float ref_val = to_float_for_compare(output_ref.mData[i]); + float diff = std::abs(gpu_val - ref_val); + float rel_diff = (std::abs(ref_val) > 1e-6f) ? diff / std::abs(ref_val) : diff; + + max_diff = std::max(max_diff, diff); + max_rel_diff = std::max(max_rel_diff, rel_diff); + + if(diff > atol && rel_diff > rtol) + { + num_errors++; + if(num_errors <= 5) + { + std::cout << " Mismatch at index " << i << ": GPU=" << gpu_val + << ", Ref=" << ref_val << ", Diff=" << diff << std::endl; + } + } + } + + std::cout << "\nValidation results:" << std::endl; + std::cout << " Max absolute difference: " << max_diff << std::endl; + std::cout << " Max relative difference: " << max_rel_diff << std::endl; + std::cout << " Number of mismatches: " << num_errors << " / " + << output_host_bhsd.mData.size() << std::endl; + + if(num_errors == 0) + { + std::cout << "\n>>> VALIDATION PASSED <<<" << std::endl; + } + else + { + std::cout << "\n>>> VALIDATION FAILED <<<" << std::endl; + pass = false; + } + } + + std::cout << "\n" << (pass ? "TEST PASSED" : "TEST FAILED") << std::endl; + return pass; +} + +// ============================================================================ +// Main +// ============================================================================ + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + { + std::cerr << "Failed to parse arguments" << std::endl; + return -1; + } + + std::string prec = arg_parser.get_str("prec"); + + bool test_result = false; + if(prec == "fp16") + { + test_result = run_test(arg_parser); + } + else if(prec == "bf16") + { + test_result = run_test(arg_parser); + } + else + { + std::cerr << "Unsupported precision: " << prec << std::endl; + return -1; + } + + return test_result ? 0 : -1; +} diff --git a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp new file mode 100644 index 00000000000..dd1d3e60bee --- /dev/null +++ b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp @@ -0,0 +1,429 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +// Demo: Sparge block-map -> (delta LUT) -> VSA sparse attention + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/reference/reference_blocked_attention.hpp" +#include "ck_tile/core/utility/bit_cast.hpp" + +#include "jenga_sparse_attention.h" +#include "sparge_tool.hpp" + +// ============================================================================ +// Helper Functions +// ============================================================================ + +template +ck_tile::HostTensor make_qkv_tensor(ck_tile::index_t batch, + ck_tile::index_t nhead, + ck_tile::index_t seqlen, + ck_tile::index_t hdim, + bool i_perm) +{ + if(i_perm) + { + return ck_tile::HostTensor({batch, nhead, seqlen, hdim}); + } + return ck_tile::HostTensor({batch, seqlen, nhead, hdim}); +} + +template +ck_tile::HostTensor to_bhsd(const ck_tile::HostTensor& tensor, bool is_bhsd) +{ + auto lens = tensor.get_lengths(); + ck_tile::index_t batch = lens[0]; + ck_tile::index_t seqlen = is_bhsd ? lens[2] : lens[1]; + ck_tile::index_t nhead = is_bhsd ? lens[1] : lens[2]; + ck_tile::index_t hdim = lens[3]; + + ck_tile::HostTensor out({batch, nhead, seqlen, hdim}); + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t s = 0; s < seqlen; ++s) + { + for(ck_tile::index_t d = 0; d < hdim; ++d) + { + out(b, h, s, d) = is_bhsd ? tensor(b, h, s, d) : tensor(b, s, h, d); + } + } + } + } + return out; +} + +template +auto get_error_tolerance() +{ + double rtol = 1e-2; + double atol = 4e-2; + if constexpr(std::is_same_v) + { + atol = 2e-1; + rtol = 2e-1; + } + return ck_tile::make_tuple(rtol, atol); +} + +template +float to_float_for_compare(T value) +{ + return static_cast(value); +} + +template <> +float to_float_for_compare(ck_tile::bf16_t value) +{ +#if CK_TILE_USE_CUSTOM_DATA_TYPE + return static_cast(value); +#else + return ck_tile::bf16_to_float_raw(ck_tile::bit_cast(value)); +#endif +} + +// ============================================================================ +// Command line argument parser +// ============================================================================ + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("v", "1", "0:no validation, 1:cpu validation") + .insert("b", "1", "batch size") + .insert("h", "4", "num of head for q") + .insert("h_k", "-1", "num of head for k/v, -1 means equal to h") + .insert("s", "4096", "seqlen_q") + .insert("s_k", "-1", "seqlen_k, -1 means equal to s") + .insert("d", "128", "head dim for q, k") + .insert("d_v", "-1", "head dim for v, -1 means equal to d") + .insert("prec", "fp16", "data type: fp16/bf16") + .insert("iperm", "1", "permute input, 1: b*h*s*d, 0: b*s*h*d") + .insert("operm", "1", "permute output") + .insert("seed", "42", "random seed") + .insert("warmup", "5", "warmup iterations") + .insert("repeat", "20", "benchmark iterations") + .insert("kname", "0", "print kernel name") + // Sparge-specific + .insert("blkq", "128", "Sparge BLKQ") + .insert("blkk", "128", "Sparge BLKK") + .insert("simthreshd1", "0.6", "Sparge sim threshold") + .insert("cdfthreshd", "0.98", "Sparge CDF threshold (used when topk < 0)") + .insert("topk", "-1.0", "Sparge topk ratio in (0,1]; if > 0, overrides cdfthreshd"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// 
============================================================================ +// Main Test Function +// ============================================================================ + +template +bool run_test(const ck_tile::ArgParser& arg_parser) +{ + int do_validation = arg_parser.get_int("v"); + ck_tile::index_t batch = arg_parser.get_int("b"); + ck_tile::index_t nhead = arg_parser.get_int("h"); + ck_tile::index_t nhead_k = arg_parser.get_int("h_k"); + ck_tile::index_t seqlen_q = arg_parser.get_int("s"); + ck_tile::index_t seqlen_k = arg_parser.get_int("s_k"); + ck_tile::index_t hdim_q = arg_parser.get_int("d"); + ck_tile::index_t hdim_v = arg_parser.get_int("d_v"); + bool i_perm = arg_parser.get_bool("iperm"); + bool o_perm = arg_parser.get_bool("operm"); + uint32_t seed = arg_parser.get_uint32("seed"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int kname = arg_parser.get_int("kname"); + + // Sparge params + ck_tile::index_t blkq = arg_parser.get_int("blkq"); + ck_tile::index_t blkk = arg_parser.get_int("blkk"); + float simthreshd1 = arg_parser.get_float("simthreshd1"); + float cdfthreshd = arg_parser.get_float("cdfthreshd"); + float topk = arg_parser.get_float("topk"); + + if(nhead_k < 0) + nhead_k = nhead; + if(seqlen_k < 0) + seqlen_k = seqlen_q; + if(hdim_v < 0) + hdim_v = hdim_q; + + if(blkq != 128 || blkk != 128 || hdim_q != 128 || hdim_v != 128) + { + std::cout << "\n>>> TEST SKIPPED <<<" << std::endl; + std::cout << "VSA kernel instances are generated for BLKQ=BLKK=128, " + "hdim_q=128, hdim_v=128 only." + << std::endl; + std::cout << "TEST SKIPPED" << std::endl; + return true; + } + + ck_tile::index_t BLKQ = blkq; + ck_tile::index_t BLKK = blkk; + + ck_tile::index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ; + ck_tile::index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK; + + std::cout << "============================================================" << std::endl; + std::cout << "[Sparge -> VSA Sparse Attention Demo]" << std::endl; + std::cout << "============================================================" << std::endl; + std::cout << " Batch: " << batch << ", nhead_q: " << nhead << ", nhead_k: " << nhead_k + << std::endl; + std::cout << " seqlen_q: " << seqlen_q << ", seqlen_k: " << seqlen_k << std::endl; + std::cout << " hdim_q: " << hdim_q << ", hdim_v: " << hdim_v << std::endl; + std::cout << " BLKQ=" << BLKQ << ", BLKK=" << BLKK << std::endl; + std::cout << " num_q_blocks: " << num_q_blocks << ", num_k_blocks: " << num_k_blocks + << std::endl; + std::cout << " Sparge(simthreshd1=" << simthreshd1 << ", cdfthreshd=" << cdfthreshd + << ", topk=" << topk << ")" << std::endl; + std::cout << " i_perm: " << i_perm << ", o_perm: " << o_perm << std::endl; + + // Create host tensors + ck_tile::HostTensor q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); + ck_tile::HostTensor k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); + ck_tile::HostTensor v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); + ck_tile::HostTensor output_host = + o_perm ? ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) + : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); + ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); + + std::cout << "\nInitializing tensors..." 
<< std::endl; + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); + + // Build block map using Sparge tool + std::cout << "Building Sparge block map..." << std::endl; + sparge::SpargeParams p; + p.BLKQ = static_cast(BLKQ); + p.BLKK = static_cast(BLKK); + p.simthreshd1 = simthreshd1; + p.cdfthreshd = cdfthreshd; + p.topk = topk; + p.i_perm = i_perm; + + ck_tile::HostTensor block_relation_onehot = + sparge::build_block_map_meansim(q_host, k_host, p); + + // Convert to VSA LUT (delta-encoded) + valid_block_num + std::cout << "Converting block map to VSA LUT (delta)..." << std::endl; + auto vsa_lut = sparge::block_map_to_vsa_lut_delta(block_relation_onehot); + + // Print actual sparsity (based on one-hot) + std::size_t total_blocks = 0; + std::size_t active_blocks = 0; + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) + { + for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) + { + total_blocks++; + if(block_relation_onehot(b, h, qb, kb) != 0) + active_blocks++; + } + } + } + } + float actual_sparsity = + 1.0f - static_cast(active_blocks) / static_cast(total_blocks); + std::cout << " Actual sparsity: " << actual_sparsity << " (" << active_blocks << "/" + << total_blocks << " blocks active)" << std::endl; + + std::cout << "\n--- Running VSA sparse attention kernel ---" << std::endl; + + try + { + if(kname) + { + vsa_sparse_attention(q_host, + k_host, + v_host, + vsa_lut.lut, + vsa_lut.valid_block_num, + output_host, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + i_perm, + o_perm, + seqlen_q, + seqlen_k, + 1); + } + + for(int i = 0; i < warmup; ++i) + { + vsa_sparse_attention(q_host, + k_host, + v_host, + vsa_lut.lut, + vsa_lut.valid_block_num, + output_host, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + i_perm, + o_perm, + seqlen_q, + seqlen_k, + 0); + } + + [[maybe_unused]] auto sync_status1 = hipDeviceSynchronize(); + auto start = std::chrono::high_resolution_clock::now(); + + for(int i = 0; i < repeat; ++i) + { + vsa_sparse_attention(q_host, + k_host, + v_host, + vsa_lut.lut, + vsa_lut.valid_block_num, + output_host, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + hdim_v, + i_perm, + o_perm, + seqlen_q, + seqlen_k, + 0); + } + + [[maybe_unused]] auto sync_status2 = hipDeviceSynchronize(); + auto end = std::chrono::high_resolution_clock::now(); + double avg_time_ms = + std::chrono::duration(end - start).count() / repeat; + + std::cout << "\n>>>> VSA sparse attention average time: " << avg_time_ms << " ms <<<<" + << std::endl; + } + catch(const std::exception& e) + { + std::cerr << "Error during kernel execution: " << e.what() << std::endl; + return false; + } + + bool pass = true; + if(do_validation) + { + std::cout << "\n--- Performing CPU validation ---" << std::endl; + float scale = 1.0f / std::sqrt(static_cast(hdim_q)); + + std::cout << "Computing reference output..." 
<< std::endl; + auto q_ref = to_bhsd(q_host, i_perm); + auto k_ref = to_bhsd(k_host, i_perm); + auto v_ref = to_bhsd(v_host, i_perm); + + ck_tile::reference_blocked_attention( + q_ref, k_ref, v_ref, block_relation_onehot, output_ref, BLKQ, BLKK, scale); + + auto [rtol, atol] = get_error_tolerance(); + + float max_diff = 0.0f; + float max_rel_diff = 0.0f; + std::size_t num_errors = 0; + + auto output_host_bhsd = to_bhsd(output_host, o_perm); + for(std::size_t i = 0; i < output_host_bhsd.mData.size(); ++i) + { + float gpu_val = to_float_for_compare(output_host_bhsd.mData[i]); + float ref_val = to_float_for_compare(output_ref.mData[i]); + float diff = std::abs(gpu_val - ref_val); + float rel_diff = (std::abs(ref_val) > 1e-6f) ? diff / std::abs(ref_val) : diff; + + max_diff = std::max(max_diff, diff); + max_rel_diff = std::max(max_rel_diff, rel_diff); + + if(diff > atol && rel_diff > rtol) + { + num_errors++; + if(num_errors <= 5) + { + std::cout << " Mismatch at index " << i << ": GPU=" << gpu_val + << ", Ref=" << ref_val << ", Diff=" << diff << std::endl; + } + } + } + + std::cout << "\nValidation results:" << std::endl; + std::cout << " Max absolute difference: " << max_diff << std::endl; + std::cout << " Max relative difference: " << max_rel_diff << std::endl; + std::cout << " Number of mismatches: " << num_errors << " / " + << output_host_bhsd.mData.size() << std::endl; + + if(num_errors == 0) + { + std::cout << "\n>>> VALIDATION PASSED <<<" << std::endl; + } + else + { + std::cout << "\n>>> VALIDATION FAILED <<<" << std::endl; + pass = false; + } + } + + std::cout << "\n" << (pass ? "TEST PASSED" : "TEST FAILED") << std::endl; + return pass; +} + +// ============================================================================ +// Main +// ============================================================================ + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + { + std::cerr << "Failed to parse arguments" << std::endl; + return -1; + } + + std::string prec = arg_parser.get_str("prec"); + + bool test_result = false; + if(prec == "fp16") + { + test_result = run_test(arg_parser); + } + else if(prec == "bf16") + { + test_result = run_test(arg_parser); + } + else + { + std::cerr << "Unsupported precision: " << prec << std::endl; + return -1; + } + + return test_result ? 
0 : -1; +} From 9317fc4a8508cb53ec9bd829781d4781d84ce428 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Tue, 24 Mar 2026 05:57:54 -0400 Subject: [PATCH 2/7] Support 64x128 tile size in sparge fwd for Jenga and VSA paths --- example/ck_tile/50_sparse_attn/CMakeLists.txt | 116 ++- .../codegen/ops/sparge_fwd_jenga.py | 799 ++++++++++++++++++ .../codegen/ops/sparge_fwd_vsa.py | 799 ++++++++++++++++++ .../ck_tile/50_sparse_attn/fmha_fwd_trek.hpp | 6 + .../50_sparse_attn/jenga_sparge_attention.cpp | 189 +++++ .../50_sparse_attn/jenga_sparge_attention.h | 27 + .../test_sparge_jenga_sparse_attn.cpp | 14 +- .../test_sparge_vsa_sparse_attn.cpp | 14 +- .../50_sparse_attn/vsa_sparge_attention.cpp | 195 +++++ .../50_sparse_attn/vsa_sparge_attention.h | 28 + ...block_fmha_pipeline_qr_ks_vs_async_vsa.hpp | 2 +- 11 files changed, 2167 insertions(+), 22 deletions(-) create mode 100644 example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py create mode 100644 example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py create mode 100644 example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp create mode 100644 example/ck_tile/50_sparse_attn/jenga_sparge_attention.h create mode 100644 example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp create mode 100644 example/ck_tile/50_sparse_attn/vsa_sparge_attention.h diff --git a/example/ck_tile/50_sparse_attn/CMakeLists.txt b/example/ck_tile/50_sparse_attn/CMakeLists.txt index c916f642ebb..0ac86f6affa 100644 --- a/example/ck_tile/50_sparse_attn/CMakeLists.txt +++ b/example/ck_tile/50_sparse_attn/CMakeLists.txt @@ -1,8 +1,8 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT -# CMakeLists.txt for sparse attention (Jenga and VSA) +#Copyright(c) Advanced Micro Devices, Inc., or its affiliates. 
+#SPDX - License - Identifier : MIT +#CMakeLists.txt for sparse attention(Jenga and VSA) -# Use SUPPORTED_GPU_TARGETS directly +#Use SUPPORTED_GPU_TARGETS directly set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) set(GPU_TARGETS ${SUPPORTED_GPU_TARGETS}) @@ -16,7 +16,7 @@ endif() message(STATUS "Building Sparse Attention (Jenga & VSA) for targets: ${INST_TARGETS}") -# Code generation scripts +#Code generation scripts file(GLOB_RECURSE CODE_GEN_SCRIPTS CONFIGURE_DEPENDS ${CMAKE_CURRENT_LIST_DIR}/generate.py ${CMAKE_CURRENT_LIST_DIR}/codegen/*.py @@ -88,11 +88,62 @@ target_compile_options(${EXAMPLE_JENGA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) +# ============================================================================ +# Sparge Jenga (64x128 tile) +# ============================================================================ +set(SPARGE_JENGA_CODE_GEN_ARGS + ${CMAKE_CURRENT_LIST_DIR}/generate.py + --api sparge_fwd_jenga + --receipt 600 +) + +execute_process( + COMMAND ${Python3_EXECUTABLE} ${SPARGE_JENGA_CODE_GEN_ARGS} + --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/sparge_jenga_blob_list.txt + RESULT_VARIABLE ret +) +if(ret AND NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to generate Sparge Jenga kernel list") +endif() + +file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/sparge_jenga_blob_list.txt SPARGE_JENGA_GEN_BLOBS) + +add_custom_command( + OUTPUT ${SPARGE_JENGA_GEN_BLOBS} + COMMAND ${Python3_EXECUTABLE} ${SPARGE_JENGA_CODE_GEN_ARGS} + --output_dir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CODE_GEN_SCRIPTS} + COMMENT "Generate CK Tile Sparge Jenga kernels" +) + +message(STATUS "Sparge Jenga kernel files to be generated: ${SPARGE_JENGA_GEN_BLOBS}") + +set(SPARGE_JENGA_INSTANCES "tile_sparge_jenga_instances") + +add_library(${SPARGE_JENGA_INSTANCES} OBJECT EXCLUDE_FROM_ALL + ${SPARGE_JENGA_GEN_BLOBS} + ${CMAKE_CURRENT_LIST_DIR}/jenga_sparge_attention.cpp +) +target_include_directories(${SPARGE_JENGA_INSTANCES} PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn +) +set_source_files_properties(${SPARGE_JENGA_GEN_BLOBS} PROPERTIES LANGUAGE HIP) +set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/jenga_sparge_attention.cpp PROPERTIES LANGUAGE HIP) +set_property(TARGET ${SPARGE_JENGA_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) + +target_compile_options(${SPARGE_JENGA_INSTANCES} PRIVATE + -DCK_TILE_USE_BUFFER_ADDRESSING_BUILTIN + -DCK_TILE_FMHA_FWD_FAST_EXP2 + -Wno-undefined-func-template + -Wno-float-equal +) + # Sparge + Jenga Example executable set(EXAMPLE_SPARGE_JENGA_SPARSE_ATTN "tile_example_sparge_jenga_sparse_attn") message(DEBUG "adding example ${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN}") add_executable(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_jenga_sparse_attn.cpp) -target_link_libraries(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} ${SPARSE_ATTN_JENGA_INSTANCES}) +target_link_libraries(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} ${SPARGE_JENGA_INSTANCES}) target_include_directories(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_compile_options(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} PRIVATE -Wno-undefined-func-template @@ -164,11 +215,62 @@ target_compile_options(${EXAMPLE_VSA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) +# ============================================================================ +# Sparge VSA (64x128 tile) +# ============================================================================ +set(SPARGE_VSA_CODE_GEN_ARGS + ${CMAKE_CURRENT_LIST_DIR}/generate.py + --api sparge_fwd_vsa + --receipt 600 +) + 
+execute_process( + COMMAND ${Python3_EXECUTABLE} ${SPARGE_VSA_CODE_GEN_ARGS} + --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/sparge_vsa_blob_list.txt + RESULT_VARIABLE ret +) +if(ret AND NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to generate Sparge VSA kernel list") +endif() + +file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/sparge_vsa_blob_list.txt SPARGE_VSA_GEN_BLOBS) + +add_custom_command( + OUTPUT ${SPARGE_VSA_GEN_BLOBS} + COMMAND ${Python3_EXECUTABLE} ${SPARGE_VSA_CODE_GEN_ARGS} + --output_dir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CODE_GEN_SCRIPTS} + COMMENT "Generate CK Tile Sparge VSA kernels" +) + +message(STATUS "Sparge VSA kernel files to be generated: ${SPARGE_VSA_GEN_BLOBS}") + +set(SPARGE_VSA_INSTANCES "tile_sparge_vsa_instances") + +add_library(${SPARGE_VSA_INSTANCES} OBJECT EXCLUDE_FROM_ALL + ${SPARGE_VSA_GEN_BLOBS} + ${CMAKE_CURRENT_LIST_DIR}/vsa_sparge_attention.cpp +) +target_include_directories(${SPARGE_VSA_INSTANCES} PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + ${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn +) +set_source_files_properties(${SPARGE_VSA_GEN_BLOBS} PROPERTIES LANGUAGE HIP) +set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/vsa_sparge_attention.cpp PROPERTIES LANGUAGE HIP) +set_property(TARGET ${SPARGE_VSA_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) + +target_compile_options(${SPARGE_VSA_INSTANCES} PRIVATE + -DCK_TILE_USE_BUFFER_ADDRESSING_BUILTIN + -DCK_TILE_FMHA_FWD_FAST_EXP2 + -Wno-undefined-func-template + -Wno-float-equal +) + # Sparge + VSA Example executable set(EXAMPLE_SPARGE_VSA_SPARSE_ATTN "tile_example_sparge_vsa_sparse_attn") message(DEBUG "adding example ${EXAMPLE_SPARGE_VSA_SPARSE_ATTN}") add_executable(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_vsa_sparse_attn.cpp) -target_link_libraries(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} ${SPARSE_ATTN_VSA_INSTANCES}) +target_link_libraries(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} ${SPARGE_VSA_INSTANCES}) target_include_directories(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_compile_options(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE -Wno-undefined-func-template diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py b/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py new file mode 100644 index 00000000000..872da2326ea --- /dev/null +++ b/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py @@ -0,0 +1,799 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT +# generate kernel instances to speed up compilation + +import copy +from dataclasses import dataclass, field +import fnmatch +import itertools +import os +import os.path as path +from pathlib import Path +from typing import List, Optional, Tuple + +from codegen.cpp_symbol_map import ( + BOOL_MAP, + FWD_DTYPE_MAP, + LAYOUT_MAP, + MODE_MAP, + PIPELINE_ENUM_MAP, + PIPELINE_MAP, + get_mask_check_map, + get_mask_map, +) + +GEN_DIR = "" + + +def update_file(file_path, content): + """Update the file at file_path with the given content if it differs from the existing content. 
+ + It avoids unnecessary touching of the file which triggers rebuilds + """ + + existing_content = "" + if path.exists(file_path): + with open(file_path, "r") as file: + existing_content = file.read() + if existing_content == content: + return + with open(file_path, "w") as file: + file.write(content) + + +DTYPE_BITS = {"fp32": 32, "fp16": 16, "bf16": 16} + +K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 192: 192, 256: 256} + +FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n +// auto generated by generate.py +#include "ck_tile/ops/fmha/block/variants.hpp" +#include "fmha_fwd_trek.hpp" +#include "pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp" +#include "kernel/fmha_fwd_jenga_kernel.hpp" + +""" + +# NOTE: Jenga sparse attention kernel has the following restrictions enforced by static_assert: +# - Group mode: NOT supported (batch mode only) +# - Bias: NOT supported (NO_BIAS only) +# - LSE output: NOT supported (false only) +# - Dropout: NOT supported (false only) +# - Logits soft-cap: NOT supported (false only) +# - FP8 static quantization: NOT supported (NO_SCALE only) +# The template below hardcodes these unsupported features accordingly. + +FMHA_FWD_KERNEL_BODY = """ +using fmha_dtype_{F_idx} = {F_dtype}; + +using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; + +using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, + ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, + ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, + ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, + {F_vlayout}>; + +// TileFmhaTraits: spad, skpad, dpad, dvpad, has_logits_soft_cap, bias_enum, +// store_lse, has_dropout, has_randval, quant_scale_enum, occupancy, is_v_rowmajor_skip +using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, + {F_skpad}, + {F_dpad}, + {F_dvpad}, + false, // has_logits_soft_cap - NOT supported + ck_tile::BlockAttentionBiasEnum::NO_BIAS, // bias - NOT supported + false, // store_lse - NOT supported + false, // has_dropout - NOT supported + false, // has_randval - NOT supported + ck_tile::BlockAttentionQuantScaleEnum::NO_SCALE, // FP8 quant - NOT supported + {F_occupancy}, + false>; + +using fmha_variant_{F_idx} = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>; // logits_soft_cap=0 (NOT supported) + +using fmha_mask_{F_idx} = {F_mask}; + +using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< + typename FmhaSparseFwdTypeConfig::QDataType, + typename FmhaSparseFwdTypeConfig::KDataType, + typename FmhaSparseFwdTypeConfig::VDataType, + typename FmhaSparseFwdTypeConfig::SaccDataType, + typename FmhaSparseFwdTypeConfig::SMPLComputeDataType, + typename FmhaSparseFwdTypeConfig::BiasDataType, + typename FmhaSparseFwdTypeConfig::RandValOutputDataType, + typename FmhaSparseFwdTypeConfig::LSEDataType, + typename FmhaSparseFwdTypeConfig::PDataType, + typename FmhaSparseFwdTypeConfig::OaccDataType, + typename FmhaSparseFwdTypeConfig::ODataType, + fmha_shape_{F_idx}, + {F_mode}, + fmha_variant_{F_idx}, + fmha_mask_{F_idx}, + {F_trload}, + fmha_trait_{F_idx}>; + +using fmha_pipeline_{F_idx} = {F_pipeline}< + fmha_pipeline_problem_{F_idx}>; + +using fmha_epilogue_{F_idx} = + ck_tile::Default2DEpilogue::OaccDataType, + typename FmhaSparseFwdTypeConfig<{F_dtype}>::ODataType, + {F_spad}, {F_dvpad}>>; + +using fmha_kernel_{F_idx} = + ck_tile::FmhaFwdJengaKernel; + +using trait_{F_idx} = fmha_jenga_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, 
{F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, + {F_pipeline_enum}, false/*logits*/, fmha_mask_{F_idx}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + +#include + +template<> +float fmha_jenga_fwd_(const ck_tile::stream_config& s, fmha_jenga_fwd_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + if(s.log_level_ > 0) + std::cout << ", " << "{F_kernel_name}" << std::flush; + auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); +}} +""" + +FMHA_FWD_API_FILENAME = "sparge_jenga_fwd_api.cpp" +FMHA_FWD_API = """ +#include + +#include + +namespace {{ +bool get_num_cus(unsigned& num_cus) {{ + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) {{ + fprintf(stderr, "failed to get device"); + return false; + }} + + hipDeviceProp_t props{{}}; + status = hipGetDeviceProperties(&props, device); + if(status != hipSuccess) {{ + fprintf(stderr, "failed to get device properties"); + return false; + }} + + num_cus = props.multiProcessorCount; + return true; +}} + +unsigned get_num_thread_blocks(unsigned batch, unsigned nheads, unsigned max_seqlen_q, unsigned kM0) {{ + const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0; + const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1 + + return batch * nheads * num_m_blocks * num_n_blocks; +}} +}} // namespace + +float sparge_jenga_fwd(fmha_jenga_fwd_traits t, fmha_jenga_fwd_args a, const ck_tile::stream_config& s){{ + float r = -1; + + [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate + + unsigned num_cus; + if (!get_num_cus(num_cus)) {{ + return r; + }} + + [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{ + return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0); + }}; + + const bool has_load_tr = ck_tile::is_load_tr_supported(); + +{F_dispatch} + return r; +}} +""" + +FMHA_FWD_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ +{F_dtype_case} + }} +""" + +FMHA_FWD_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && + ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ + using trait_ = fmha_jenga_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + return fmha_jenga_fwd_(s, a); + }} +""" + + +@dataclass +class CppConstraint: + bool_expr: str = None + + def __str__(self): + if self.bool_expr is None: + return "true" + else: + return f"{self.bool_expr}" + + def __and__(self, other): + return CppConstraint(f"({str(self)}) && ({str(other)})") + + +@dataclass +class FmhaFwdApiTrait: + pipeline_tag: str + # sync with fmha_fwd_traits<>, to generate fallback calls + hdim: str + dtype: str # data type + mode: str # value from MODE_MAP + bm0: int # tile size along q seqlen (block size) + bn0: int # tile size along qk seqlen + bk0: int # tile size along qk gemm unroll + bn1: int # tile size along v head_dim + bk1: int # tile size along kv gemm unroll + bk0max: int + vlayout: str + logits: 
str + mask: str + spad: str + skpad: str + dpad: str + dvpad: str + tr_load: str + constraint: CppConstraint + + @property + def name(self) -> str: + return ( + f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-" + + f"{self.vlayout}-{self.logits}-{self.mask}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}" + ) + + @property + def scheck(self) -> str: + if self.mode == "group": + return "true/*group mode spad always true*/" # group mode only generate spad/skpad == true + if self.spad == "t": + return "true" # always support + return "true" + + @property + def seqtune(self) -> str: + return "true" + + @property + def skcheck(self) -> str: + if self.mode == "group": + return "true/*group mode skpad always true*/" # group mode only generate spad/skpad == true + if self.skpad == "t": + return f"a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0" + return f"a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0" + + @property + def dcheck(self) -> str: + vec = int((32 * 4) / DTYPE_BITS[self.dtype]) + if self.dpad == "t": + return f"a.hdim_q % {vec} == 0" + assert False + + @property + def dvcheck(self) -> str: + vec = int((32 * 4) / DTYPE_BITS[self.dtype]) + if self.dvpad == "t": + return f"a.hdim_v % {vec} == 0" + assert False + + +@dataclass +class FmhaFwdPipeline: + tag: str + + F_vlayout: str # row/col + F_spad: str # true/false + F_skpad: str # + F_dpad: str # + F_dvpad: str # + F_logits: str # t/f + F_mask: str # value from MASK_MAP + F_trload: str # true/false + F_constraint: CppConstraint = field(default_factory=CppConstraint) + + @property + def name(self) -> str: + def pad_name() -> str: + n = "" + if self.F_spad == "t": + n += "s" + if self.F_skpad == "t": + n += "sk" + if self.F_dpad == "t": + n += "d" + if self.F_dvpad == "t": + n += "dv" + if n != "": + n = "p" + n + return n + + pn = pad_name() + n = f"{self.tag}_v{self.F_vlayout[0]}" + if pn != "": + n += f"_{pn}" + else: + n += "_npad" + + if self.F_logits == "t": + n += "_logits" + else: + n += "_nlogits" + + n += "_nbias" + + if self.F_mask[0:2] == "s_": + if self.F_mask == "s_mask": + n += "_mask" + else: + n += "_nmask" + else: + if self.F_mask != "no": + n += f"_m{self.F_mask[0]}" + else: + n += "_nmask" + + n += "_nskip" + + n += "_nsquant" + + if self.F_trload == "t": + n += "_trload" + else: + n += "_ntrload" + + return n + + +class FmhaFwdApiPool: + def __init__(self, mask_impl): + self.pool = dict() + self.mask_impl = mask_impl + + def register_traits(self, trait: FmhaFwdApiTrait) -> None: + # TODO: do we need to check duplication? 
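        # The pool is a two-level dict, dtype -> (hdim, bn1) -> list of traits,
        # which the `api` property later walks to emit the dispatch if/else chain.
        # Illustrative shape only (keys come straight from the registered traits):
        #
        #     self.pool == {"fp16": {("128", 128): [trait_a, trait_b, ...]}, ...}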
+ if trait.dtype not in self.pool.keys(): + self.pool[trait.dtype] = dict() + hdim = trait.hdim, trait.bn1 + if hdim not in self.pool[trait.dtype].keys(): + self.pool[trait.dtype][hdim] = list() + + self.pool[trait.dtype][hdim].append(copy.copy(trait)) + + @property + def api(self) -> str: + tr_load_cond_map = {"t": "has_load_tr", "f": "true"} + + per_tr_load = str() + for tr_load in ["t", "f"]: + per_dtypes = str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case = str() + for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): + traits = [ + t + for t in self.pool[dtype][(hdim, hdim_v)] + if tr_load == t.tr_load + ] + inners = str() + for k, trait in enumerate(traits): + if_k = "if" if k == 0 else "else if" + inners = inners + FMHA_FWD_API_INNER_DISPATCH.format( + F_if=if_k, + F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], + # F_logits removed - hardcoded to false (NOT supported) + F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], + F_trload=BOOL_MAP[trait.tr_load], + F_scheck=trait.scheck, + F_seqtune=trait.seqtune, + F_skcheck=trait.skcheck, + F_dcheck=trait.dcheck, + F_dvcheck=trait.dvcheck, + F_constraint=trait.constraint, + F_spad=BOOL_MAP[trait.spad], + F_skpad=BOOL_MAP[trait.skpad], + F_dpad=BOOL_MAP[trait.dpad], + F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, + F_bn0=trait.bn0, + F_bk0=trait.bk0, + F_bn1=trait.bn1, + F_bk1=trait.bk1, + F_bk0max=trait.bk0max, + F_hdim=hdim, + F_dtype=FWD_DTYPE_MAP[dtype], + ) + if_j = "if" if j == 0 else "else if" + per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format( + F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners + ) + if_i = "if" if i == 0 else "else if" + per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format( + F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case + ) + per_tr_load += FMHA_FWD_API_PER_TRLOAD.format( + F_if="if", + F_trload_cond=tr_load_cond_map[tr_load], + F_dtype_case=per_dtypes, + ) + if not per_tr_load: + # empty string we add some ignore to suppress warning in api + per_tr_load += " (void)t ; (void)s ; (void)a;" + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) + + +@dataclass +class FmhaFwdTileSize: + F_bm0: int # tile size along q seqlen (block size) + F_bn0: int # tile size along k seqlen + F_bk0: int # tile size along qk gemm unroll + F_bn1: int # tile size along v head_dim + F_bk1: int # tile size along kv gemm unroll + F_bk0max: int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) + F_rm0: int # number of warps for gemm0 along q seqlen + F_rn0: int # number of warps for gemm0 along k seqlen + F_rk0: int # number of warps for gemm0 along head dim q (not used) + F_rm1: int # number of warps for gemm1 along q seqlen + F_rn1: int # number of warps for gemm1 along head dim v + F_rk1: int # number of warps for gemm1 along k seqlen (not used) + F_wm0: int # gemm0 warp size along m + F_wn0: int # gemm0 warp size along n + F_wk0: int # gemm0 warp size along k + F_wm1: int # gemm1 warp size along m + F_wn1: int # gemm1 warp size along n + F_wk1: int # gemm1 warp size along k + F_occupancy: int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy + F_constraint: CppConstraint = field(default_factory=CppConstraint) + + @property + def name(self) -> str: + return ( + 
f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" + + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" + + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" + + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") + ) + + +@dataclass +class FmhaFwdKernel: + F_idx: int # this is not a tunable, but a counter to differentiate symbol + F_hdim: int # hdim + F_dtype: str # data type + F_mode: str # value from MODE_MAP + F_tile: FmhaFwdTileSize + F_pipeline: FmhaFwdPipeline + mask_impl: str + + @property + def template(self) -> str: + # kernel_body removed - unused + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_KERNEL_BODY.format( + F_idx=self.F_idx, + F_hdim=self.F_hdim, + F_dtype=FWD_DTYPE_MAP[self.F_dtype], + F_bm0=self.F_tile.F_bm0, + F_bn0=self.F_tile.F_bn0, + F_bk0=self.F_tile.F_bk0, + F_bn1=self.F_tile.F_bn1, + F_bk1=self.F_tile.F_bk1, + F_bk0max=self.F_tile.F_bk0max, + F_rm0=self.F_tile.F_rm0, + F_rn0=self.F_tile.F_rn0, + F_rk0=self.F_tile.F_rk0, + F_rm1=self.F_tile.F_rm1, + F_rn1=self.F_tile.F_rn1, + F_rk1=self.F_tile.F_rk1, + F_wm0=self.F_tile.F_wm0, + F_wn0=self.F_tile.F_wn0, + F_wk0=self.F_tile.F_wk0, + F_wm1=self.F_tile.F_wm1, + F_wn1=self.F_tile.F_wn1, + F_wk1=self.F_tile.F_wk1, + F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout], + F_spad=BOOL_MAP[self.F_pipeline.F_spad], + F_skpad=BOOL_MAP[self.F_pipeline.F_skpad], + F_dpad=BOOL_MAP[self.F_pipeline.F_dpad], + F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad], + # F_logits removed - hardcoded to false in template (NOT supported) + F_occupancy=self.F_tile.F_occupancy, + F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag], + F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask], + F_mode=MODE_MAP[self.F_mode], + F_pipeline=PIPELINE_MAP[self.F_pipeline.tag], + F_trload=BOOL_MAP[self.F_pipeline.F_trload], + F_kernel_name=self.name, + ) + + @property + def name(self) -> str: + # TODO: we don't encode idx here + return ( + f"fmha_jenga_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + + self.F_tile.name + + "_" + + self.F_pipeline.name + ) + + @property + def filename(self) -> str: + return self.name + ".cpp" + + def api_trait(self) -> FmhaFwdApiTrait: + return FmhaFwdApiTrait( + pipeline_tag=self.F_pipeline.tag, + hdim=str(self.F_hdim), + dtype=self.F_dtype, + mode=self.F_mode, + bm0=self.F_tile.F_bm0, + bn0=self.F_tile.F_bn0, + bk0=self.F_tile.F_bk0, + bn1=self.F_tile.F_bn1, + bk1=self.F_tile.F_bk1, + bk0max=self.F_tile.F_bk0max, + vlayout=self.F_pipeline.F_vlayout, + mask=self.F_pipeline.F_mask, + logits=self.F_pipeline.F_logits, + spad=self.F_pipeline.F_spad, + skpad=self.F_pipeline.F_skpad, + dpad=self.F_pipeline.F_dpad, + dvpad=self.F_pipeline.F_dvpad, + tr_load=self.F_pipeline.F_trload, + constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint, + ) + + +class KernelComponentFactory: + # TODO: design a more practical way to do it + # this is current supported tile size per hdim + @staticmethod + def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: + if dtype == "fp16" or dtype == "bf16": + return { + # (32, 32) : [FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + # (64, 64) : [FmhaFwdTileSize(16, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1), + # FmhaFwdTileSize(32, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1), + # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + # (96, 128) : 
[FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + (128, 128): [ + FmhaFwdTileSize( + 64, + 128, + 64, + 128, + 64, + 128, + 4, + 1, + 1, + 4, + 1, + 1, + 16, + 16, + 16, + 16, + 16, + 16, + -1, + ), + ], + # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], + # (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + # (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], + # (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + } + else: + return None + + # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad + # support this in future + @staticmethod + def get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) -> List[FmhaFwdPipeline]: + # this function will populate a list possible pipelines + # TODO: the order of List matters! the later in this list will be also be checked later + # NOTE: logits soft-cap is NOT supported by Jenga sparse attention (enforced by static_assert) + pipelines = [] + if dtype in ["fp16", "bf16"]: + for logits, mask in itertools.product( + ["f"], # logits soft-cap NOT supported, always false + get_mask_map(mask_impl).keys(), + ): + if hdim == 256 and hdim_v == 256: + # jenga fmha only supports dim <= 192 for now. + continue + pipelines.append( + FmhaFwdPipeline( # fmt: skip + "qr_async", + "row", + "t", + "f", + "t", + "t", + logits, + mask, + "f", + ) + ) + pipelines.append( + FmhaFwdPipeline( # fmt: skip + "qr_async", + "row", + "t", + "t", + "t", + "t", + logits, + mask, + "f", + ) + ) + else: + assert False + return pipelines + + +class CustomFactory(KernelComponentFactory): + @staticmethod + def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: + result = KernelComponentFactory.get_hdim_tile_size_dict(dtype) + if dtype == "fp16" or dtype == "bf16": + if (128, 128) in result.keys(): + result[(128, 128)].insert( + 0, + FmhaFwdTileSize( + 64, + 128, + 64, + 128, + 64, + 128, + 4, + 1, + 1, + 4, + 1, + 1, + 16, + 16, + 16, + 16, + 16, + 16, + -1, + CppConstraint( + "get_num_blocks(128) < num_cus * min_cu_util_rate" + ), + ), + ) + return result + + +def get_fwd_blobs( + kernel_filter: Optional[str], receipt, optdim_list, mask_impl +) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: + gen = list() + api_pool = FmhaFwdApiPool(mask_impl) + + factory = ( + CustomFactory + if os.environ.get("CK_TILE_FMHA_FWD_CUSTOM_FACTORY", "0") == "1" + else KernelComponentFactory + ) + + # Only generate fp16/bf16 kernels for now. 
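+    # (get_hdim_tile_size_dict returns None for any other dtype, so the
+    # "d is None" check below is only a safety net.)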
+ # NOTE: Jenga sparse attention only supports batch mode (group mode NOT supported, enforced by static_assert) + for dtype in ["fp16", "bf16"]: + d = factory.get_hdim_tile_size_dict(dtype) + if d is None: + continue + for ((hdim, hdim_v), tiles), mode in itertools.product(d.items(), ["batch"]): + for tile, pipeline in itertools.product( + tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) + ): + if pipeline.tag != "qr_async": + continue + k = FmhaFwdKernel( + F_idx=2, + F_hdim=hdim, + F_dtype=dtype, + F_mode=mode, + F_tile=tile, + F_pipeline=pipeline, + mask_impl=mask_impl, + ) + if kernel_filter != "": + if not fnmatch.fnmatch(k.name, kernel_filter): + continue + if optdim_list != [-1]: + if hdim not in optdim_list: + continue + # 2 - Flash attention integration + if receipt in (2, 3): + cond = dtype in ["fp16", "bf16"] + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + # PyTorch integration + elif receipt == 4: + cond = dtype in ["fp16", "bf16"] + cond &= pipeline.F_vlayout == "row" + cond &= mode == "batch" + cond &= pipeline.F_logits == "f" + if not cond: + continue + # Aiter(mha_fwd) integration + elif receipt == 100: + cond = dtype in ["fp16", "bf16"] + cond &= mode == "batch" + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + # Aiter(mha_varlen_fwd) integration + elif receipt == 200: + cond = dtype in ["fp16", "bf16"] + cond &= mode == "group" + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + # aiter::mha_fwd C++ api integration + elif receipt == 600: + cond = dtype in ["fp16", "bf16"] + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + + api_pool.register_traits(k.api_trait()) + gen.append(k) + + return (api_pool, gen) + + +def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: + update_file(autogen_dir / kernel.filename, kernel.template) + + +def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: + update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) + + +def write_blobs( + output_dir: Path, kernel_filter: str, receipt, optdim_list, mask_impl +) -> None: + api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) + for kernel in kernels: + write_single_fwd_kernel(kernel, output_dir) + write_fwd_api(api_pool, output_dir) + + +def list_blobs( + file_path: Path, kernel_filter: str, receipt, optdim_list, mask_impl +) -> None: + with file_path.open("a") as f: + _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) + for kernel in kernels: + f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") + f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py b/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py new file mode 100644 index 00000000000..c9a389df3fa --- /dev/null +++ b/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py @@ -0,0 +1,799 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT +# generate kernel instances to speed up compilation + +import copy +from dataclasses import dataclass, field +import fnmatch +import itertools +import os +import os.path as path +from pathlib import Path +from typing import List, Optional, Tuple + +from codegen.cpp_symbol_map import ( + BOOL_MAP, + FWD_DTYPE_MAP, + LAYOUT_MAP, + MODE_MAP, + PIPELINE_ENUM_MAP, + PIPELINE_MAP, + get_mask_check_map, + get_mask_map, +) + +GEN_DIR = "" + + +def update_file(file_path, content): + """Update the file at file_path with the given content if it differs from the existing content. + + It avoids unnecessary touching of the file which triggers rebuilds + """ + + existing_content = "" + if path.exists(file_path): + with open(file_path, "r") as file: + existing_content = file.read() + if existing_content == content: + return + with open(file_path, "w") as file: + file.write(content) + + +DTYPE_BITS = {"fp32": 32, "fp16": 16, "bf16": 16} + +K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 192: 192, 256: 256} + +FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n +// auto generated by generate.py +#include "ck_tile/ops/fmha/block/variants.hpp" +#include "fmha_fwd_trek.hpp" +#include "pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp" +#include "kernel/fmha_fwd_vsa_kernel.hpp" + +""" + +# NOTE: VSA sparse attention kernel has the following restrictions enforced by static_assert: +# - Group mode: NOT supported (batch mode only) +# - Bias: NOT supported (NO_BIAS only) +# - LSE output: NOT supported (false only) +# - Dropout: NOT supported (false only) +# - Logits soft-cap: NOT supported (false only) +# - FP8 static quantization: NOT supported (NO_SCALE only) +# The template below hardcodes these unsupported features accordingly. 
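+#
+# Illustrative sketch only (the literal values here are hypothetical): the body
+# is a plain str.format template, so a call along the lines of
+#   FMHA_FWD_KERNEL_BODY.format(F_idx=1, F_dtype="ck_tile::fp16_t",
+#                               F_bm0=64, F_bn0=128, ...)
+# expands into one self-contained translation unit per (tile, pipeline)
+# combination; the real substitution is done by FmhaFwdKernel.template below.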
+ +FMHA_FWD_KERNEL_BODY = """ +using fmha_dtype_{F_idx} = {F_dtype}; + +using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; + +using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, + ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, + ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, + ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, + {F_vlayout}>; + +// TileFmhaTraits: spad, skpad, dpad, dvpad, has_logits_soft_cap, bias_enum, +// store_lse, has_dropout, has_randval, quant_scale_enum, occupancy, is_v_rowmajor_skip +using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, + {F_skpad}, + {F_dpad}, + {F_dvpad}, + false, // has_logits_soft_cap - NOT supported + ck_tile::BlockAttentionBiasEnum::NO_BIAS, // bias - NOT supported + false, // store_lse - NOT supported + false, // has_dropout - NOT supported + false, // has_randval - NOT supported + ck_tile::BlockAttentionQuantScaleEnum::NO_SCALE, // FP8 quant - NOT supported + {F_occupancy}, + false>; + +using fmha_variant_{F_idx} = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>; // logits_soft_cap=0 (NOT supported) + +using fmha_mask_{F_idx} = {F_mask}; + +using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< + typename FmhaSparseFwdTypeConfig::QDataType, + typename FmhaSparseFwdTypeConfig::KDataType, + typename FmhaSparseFwdTypeConfig::VDataType, + typename FmhaSparseFwdTypeConfig::SaccDataType, + typename FmhaSparseFwdTypeConfig::SMPLComputeDataType, + typename FmhaSparseFwdTypeConfig::BiasDataType, + typename FmhaSparseFwdTypeConfig::RandValOutputDataType, + typename FmhaSparseFwdTypeConfig::LSEDataType, + typename FmhaSparseFwdTypeConfig::PDataType, + typename FmhaSparseFwdTypeConfig::OaccDataType, + typename FmhaSparseFwdTypeConfig::ODataType, + fmha_shape_{F_idx}, + {F_mode}, + fmha_variant_{F_idx}, + fmha_mask_{F_idx}, + {F_trload}, + fmha_trait_{F_idx}>; + +using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaPipelineQRKSVSAsyncVSA< + fmha_pipeline_problem_{F_idx}>; + +using fmha_epilogue_{F_idx} = + ck_tile::Default2DEpilogue::OaccDataType, + typename FmhaSparseFwdTypeConfig<{F_dtype}>::ODataType, + {F_spad}, {F_dvpad}>>; + +using fmha_kernel_{F_idx} = + ck_tile::FmhaFwdVSAKernel; + +using trait_{F_idx} = fmha_vsa_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, + {F_pipeline_enum}, false/*logits*/, fmha_mask_{F_idx}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + +#include + +template<> +float fmha_vsa_fwd_(const ck_tile::stream_config& s, fmha_vsa_fwd_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + if(s.log_level_ > 0) + std::cout << ", " << "{F_kernel_name}" << std::flush; + auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); +}} +""" + +FMHA_FWD_API_FILENAME = "sparge_vsa_fwd_api.cpp" +FMHA_FWD_API = """ +#include + +#include + +namespace {{ +bool get_num_cus(unsigned& num_cus) {{ + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) {{ + fprintf(stderr, "failed to get device"); + return false; + }} + + hipDeviceProp_t props{{}}; + status = hipGetDeviceProperties(&props, device); + if(status != hipSuccess) {{ + fprintf(stderr, "failed to get device properties"); + return false; + }} + + num_cus = props.multiProcessorCount; + return true; +}} + +unsigned get_num_thread_blocks(unsigned 
batch, unsigned nheads, unsigned max_seqlen_q, unsigned kM0) {{ + const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0; + const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1 + + return batch * nheads * num_m_blocks * num_n_blocks; +}} +}} // namespace + +float sparge_vsa_fwd(fmha_vsa_fwd_traits t, fmha_vsa_fwd_args a, const ck_tile::stream_config& s){{ + float r = -1; + + [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate + + unsigned num_cus; + if (!get_num_cus(num_cus)) {{ + return r; + }} + + [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{ + return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0); + }}; + + const bool has_load_tr = ck_tile::is_load_tr_supported(); + +{F_dispatch} + return r; +}} +""" + +FMHA_FWD_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ +{F_dtype_case} + }} +""" + +FMHA_FWD_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && + ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ + using trait_ = fmha_vsa_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + return fmha_vsa_fwd_(s, a); + }} +""" + + +@dataclass +class CppConstraint: + bool_expr: str = None + + def __str__(self): + if self.bool_expr is None: + return "true" + else: + return f"{self.bool_expr}" + + def __and__(self, other): + return CppConstraint(f"({str(self)}) && ({str(other)})") + + +@dataclass +class FmhaFwdApiTrait: + pipeline_tag: str + # sync with fmha_fwd_traits<>, to generate fallback calls + hdim: str + dtype: str # data type + mode: str # value from MODE_MAP + bm0: int # tile size along q seqlen (block size) + bn0: int # tile size along qk seqlen + bk0: int # tile size along qk gemm unroll + bn1: int # tile size along v head_dim + bk1: int # tile size along kv gemm unroll + bk0max: int + vlayout: str + logits: str + mask: str + spad: str + skpad: str + dpad: str + dvpad: str + tr_load: str + constraint: CppConstraint + + @property + def name(self) -> str: + return ( + f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-" + + f"{self.vlayout}-{self.logits}-{self.mask}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}" + ) + + @property + def scheck(self) -> str: + if self.mode == "group": + return "true/*group mode spad always true*/" # group mode only generate spad/skpad == true + if self.spad == "t": + return "true" # always support + return "true" + + @property + def seqtune(self) -> str: + return "true" + + @property + def skcheck(self) -> str: + if self.mode == "group": + return "true/*group mode skpad always true*/" # group mode only generate spad/skpad == true + if self.skpad == "t": + return f"a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0" + return f"a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0" + + @property + def dcheck(self) -> str: + vec = int((32 * 4) / DTYPE_BITS[self.dtype]) + if self.dpad == "t": + return f"a.hdim_q % {vec} == 0" + assert False + + @property + def dvcheck(self) -> str: + vec = int((32 * 4) / DTYPE_BITS[self.dtype]) + if self.dvpad == "t": + return 
f"a.hdim_v % {vec} == 0" + assert False + + +@dataclass +class FmhaFwdPipeline: + tag: str + + F_vlayout: str # row/col + F_spad: str # true/false + F_skpad: str # + F_dpad: str # + F_dvpad: str # + F_logits: str # t/f + F_mask: str # value from MASK_MAP + F_trload: str # true/false + F_constraint: CppConstraint = field(default_factory=CppConstraint) + + @property + def name(self) -> str: + def pad_name() -> str: + n = "" + if self.F_spad == "t": + n += "s" + if self.F_skpad == "t": + n += "sk" + if self.F_dpad == "t": + n += "d" + if self.F_dvpad == "t": + n += "dv" + if n != "": + n = "p" + n + return n + + pn = pad_name() + n = f"{self.tag}_v{self.F_vlayout[0]}" + if pn != "": + n += f"_{pn}" + else: + n += "_npad" + + if self.F_logits == "t": + n += "_logits" + else: + n += "_nlogits" + + n += "_nbias" + + if self.F_mask[0:2] == "s_": + if self.F_mask == "s_mask": + n += "_mask" + else: + n += "_nmask" + else: + if self.F_mask != "no": + n += f"_m{self.F_mask[0]}" + else: + n += "_nmask" + + n += "_nskip" + + n += "_nsquant" + + if self.F_trload == "t": + n += "_trload" + else: + n += "_ntrload" + + return n + + +class FmhaFwdApiPool: + def __init__(self, mask_impl): + self.pool = dict() + self.mask_impl = mask_impl + + def register_traits(self, trait: FmhaFwdApiTrait) -> None: + # TODO: do we need to check duplication? + if trait.dtype not in self.pool.keys(): + self.pool[trait.dtype] = dict() + hdim = trait.hdim, trait.bn1 + if hdim not in self.pool[trait.dtype].keys(): + self.pool[trait.dtype][hdim] = list() + + self.pool[trait.dtype][hdim].append(copy.copy(trait)) + + @property + def api(self) -> str: + tr_load_cond_map = {"t": "has_load_tr", "f": "true"} + + per_tr_load = str() + for tr_load in ["t", "f"]: + per_dtypes = str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case = str() + for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): + traits = [ + t + for t in self.pool[dtype][(hdim, hdim_v)] + if tr_load == t.tr_load + ] + inners = str() + for k, trait in enumerate(traits): + if_k = "if" if k == 0 else "else if" + inners = inners + FMHA_FWD_API_INNER_DISPATCH.format( + F_if=if_k, + F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], + # F_logits removed - hardcoded to false (NOT supported) + F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], + F_trload=BOOL_MAP[trait.tr_load], + F_scheck=trait.scheck, + F_seqtune=trait.seqtune, + F_skcheck=trait.skcheck, + F_dcheck=trait.dcheck, + F_dvcheck=trait.dvcheck, + F_constraint=trait.constraint, + F_spad=BOOL_MAP[trait.spad], + F_skpad=BOOL_MAP[trait.skpad], + F_dpad=BOOL_MAP[trait.dpad], + F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, + F_bn0=trait.bn0, + F_bk0=trait.bk0, + F_bn1=trait.bn1, + F_bk1=trait.bk1, + F_bk0max=trait.bk0max, + F_hdim=hdim, + F_dtype=FWD_DTYPE_MAP[dtype], + ) + if_j = "if" if j == 0 else "else if" + per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format( + F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners + ) + if_i = "if" if i == 0 else "else if" + per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format( + F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case + ) + per_tr_load += FMHA_FWD_API_PER_TRLOAD.format( + F_if="if", + F_trload_cond=tr_load_cond_map[tr_load], + F_dtype_case=per_dtypes, + ) + if not per_tr_load: + # empty string we add some ignore to suppress warning in api + per_tr_load += " (void)t ; (void)s ; (void)a;" + return 
FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) + + +@dataclass +class FmhaFwdTileSize: + F_bm0: int # tile size along q seqlen (block size) + F_bn0: int # tile size along k seqlen + F_bk0: int # tile size along qk gemm unroll + F_bn1: int # tile size along v head_dim + F_bk1: int # tile size along kv gemm unroll + F_bk0max: int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) + F_rm0: int # number of warps for gemm0 along q seqlen + F_rn0: int # number of warps for gemm0 along k seqlen + F_rk0: int # number of warps for gemm0 along head dim q (not used) + F_rm1: int # number of warps for gemm1 along q seqlen + F_rn1: int # number of warps for gemm1 along head dim v + F_rk1: int # number of warps for gemm1 along k seqlen (not used) + F_wm0: int # gemm0 warp size along m + F_wn0: int # gemm0 warp size along n + F_wk0: int # gemm0 warp size along k + F_wm1: int # gemm1 warp size along m + F_wn1: int # gemm1 warp size along n + F_wk1: int # gemm1 warp size along k + F_occupancy: int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy + F_constraint: CppConstraint = field(default_factory=CppConstraint) + + @property + def name(self) -> str: + return ( + f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" + + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" + + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" + + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") + ) + + +@dataclass +class FmhaFwdKernel: + F_idx: int # this is not a tunable, but a counter to differentiate symbol + F_hdim: int # hdim + F_dtype: str # data type + F_mode: str # value from MODE_MAP + F_tile: FmhaFwdTileSize + F_pipeline: FmhaFwdPipeline + mask_impl: str + + @property + def template(self) -> str: + # kernel_body removed - unused + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_KERNEL_BODY.format( + F_idx=self.F_idx, + F_hdim=self.F_hdim, + F_dtype=FWD_DTYPE_MAP[self.F_dtype], + F_bm0=self.F_tile.F_bm0, + F_bn0=self.F_tile.F_bn0, + F_bk0=self.F_tile.F_bk0, + F_bn1=self.F_tile.F_bn1, + F_bk1=self.F_tile.F_bk1, + F_bk0max=self.F_tile.F_bk0max, + F_rm0=self.F_tile.F_rm0, + F_rn0=self.F_tile.F_rn0, + F_rk0=self.F_tile.F_rk0, + F_rm1=self.F_tile.F_rm1, + F_rn1=self.F_tile.F_rn1, + F_rk1=self.F_tile.F_rk1, + F_wm0=self.F_tile.F_wm0, + F_wn0=self.F_tile.F_wn0, + F_wk0=self.F_tile.F_wk0, + F_wm1=self.F_tile.F_wm1, + F_wn1=self.F_tile.F_wn1, + F_wk1=self.F_tile.F_wk1, + F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout], + F_spad=BOOL_MAP[self.F_pipeline.F_spad], + F_skpad=BOOL_MAP[self.F_pipeline.F_skpad], + F_dpad=BOOL_MAP[self.F_pipeline.F_dpad], + F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad], + # F_logits removed - hardcoded to false in template (NOT supported) + F_occupancy=self.F_tile.F_occupancy, + F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag], + F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask], + F_mode=MODE_MAP[self.F_mode], + F_pipeline=PIPELINE_MAP[self.F_pipeline.tag], + F_trload=BOOL_MAP[self.F_pipeline.F_trload], + F_kernel_name=self.name, + ) + + @property + def name(self) -> str: + # TODO: we don't encode idx here + return ( + f"fmha_vsa_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + + self.F_tile.name + + "_" + + self.F_pipeline.name + ) + + @property + def filename(self) -> str: + return self.name + ".cpp" + + def api_trait(self) -> FmhaFwdApiTrait: + return 
FmhaFwdApiTrait( + pipeline_tag=self.F_pipeline.tag, + hdim=str(self.F_hdim), + dtype=self.F_dtype, + mode=self.F_mode, + bm0=self.F_tile.F_bm0, + bn0=self.F_tile.F_bn0, + bk0=self.F_tile.F_bk0, + bn1=self.F_tile.F_bn1, + bk1=self.F_tile.F_bk1, + bk0max=self.F_tile.F_bk0max, + vlayout=self.F_pipeline.F_vlayout, + mask=self.F_pipeline.F_mask, + logits=self.F_pipeline.F_logits, + spad=self.F_pipeline.F_spad, + skpad=self.F_pipeline.F_skpad, + dpad=self.F_pipeline.F_dpad, + dvpad=self.F_pipeline.F_dvpad, + tr_load=self.F_pipeline.F_trload, + constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint, + ) + + +class KernelComponentFactory: + # TODO: design a more practical way to do it + # this is current supported tile size per hdim + @staticmethod + def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: + if dtype == "fp16" or dtype == "bf16": + return { + # (32, 32) : [FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + # (64, 64) : [FmhaFwdTileSize(16, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1), + # FmhaFwdTileSize(32, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1), + # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + (128, 128): [ + FmhaFwdTileSize( + 64, + 128, + 64, + 128, + 64, + 128, + 4, + 1, + 1, + 4, + 1, + 1, + 16, + 16, + 16, + 16, + 16, + 16, + -1, + ), + ], + # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], + # (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + # (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], + # (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], + } + else: + return None + + # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad + # support this in future + @staticmethod + def get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) -> List[FmhaFwdPipeline]: + # this function will populate a list possible pipelines + # TODO: the order of List matters! the later in this list will be also be checked later + # NOTE: logits soft-cap is NOT supported by VSA sparse attention (enforced by static_assert) + pipelines = [] + if dtype in ["fp16", "bf16"]: + for logits, mask in itertools.product( + ["f"], # logits soft-cap NOT supported, always false + get_mask_map(mask_impl).keys(), + ): + if hdim == 256 and hdim_v == 256: + # vsa fmha only supports dim <= 192 for now. 
+ continue + pipelines.append( + FmhaFwdPipeline( + "qr_async_vsa", + "row", + "t", + "f", + "t", + "t", + logits, + mask, + "f", + ) + ) + pipelines.append( + FmhaFwdPipeline( + "qr_async_vsa", + "row", + "t", + "t", + "t", + "t", + logits, + mask, + "f", + ) + ) + else: + assert False + return pipelines + + +class CustomFactory(KernelComponentFactory): + @staticmethod + def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: + result = KernelComponentFactory.get_hdim_tile_size_dict(dtype) + if dtype == "fp16" or dtype == "bf16": + if (128, 128) in result.keys(): + result[(128, 128)].insert( + 0, + FmhaFwdTileSize( + 64, + 128, + 64, + 128, + 64, + 128, + 4, + 1, + 1, + 4, + 1, + 1, + 16, + 16, + 16, + 16, + 16, + 16, + -1, + CppConstraint( + "get_num_blocks(128) < num_cus * min_cu_util_rate" + ), + ), + ) + return result + + +def get_fwd_blobs( + kernel_filter: Optional[str], receipt, optdim_list, mask_impl +) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: + gen = list() + api_pool = FmhaFwdApiPool(mask_impl) + + factory = ( + CustomFactory + if os.environ.get("CK_TILE_FMHA_FWD_CUSTOM_FACTORY", "0") == "1" + else KernelComponentFactory + ) + + # Only generate fp16/bf16 kernels for now. + # NOTE: VSA sparse attention only supports batch mode (group mode NOT supported, enforced by static_assert) + for dtype in ["fp16", "bf16"]: + d = factory.get_hdim_tile_size_dict(dtype) + if d is None: + continue + for ((hdim, hdim_v), tiles), mode in itertools.product(d.items(), ["batch"]): + for tile, pipeline in itertools.product( + tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) + ): + if pipeline.tag != "qr_async_vsa": + continue + k = FmhaFwdKernel( + F_idx=1, + F_hdim=hdim, + F_dtype=dtype, + F_mode=mode, + F_tile=tile, + F_pipeline=pipeline, + mask_impl=mask_impl, + ) + if kernel_filter != "": + if not fnmatch.fnmatch(k.name, kernel_filter): + continue + if optdim_list != [-1]: + if hdim not in optdim_list: + continue + # 2 - Flash attention integration + if receipt in (2, 3): + cond = dtype in ["fp16", "bf16"] + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + # PyTorch integration + elif receipt == 4: + cond = dtype in ["fp16", "bf16"] + cond &= pipeline.F_vlayout == "row" + cond &= mode == "batch" + cond &= pipeline.F_logits == "f" + if not cond: + continue + # Aiter(mha_fwd) integration + elif receipt == 100: + cond = dtype in ["fp16", "bf16"] + cond &= mode == "batch" + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + # Aiter(mha_varlen_fwd) integration + elif receipt == 200: + cond = dtype in ["fp16", "bf16"] + cond &= mode == "group" + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + # aiter::mha_fwd C++ api integration + elif receipt == 600: + cond = dtype in ["fp16", "bf16"] + cond &= pipeline.F_vlayout == "row" + if not cond: + continue + + api_pool.register_traits(k.api_trait()) + gen.append(k) + + return (api_pool, gen) + + +def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: + update_file(autogen_dir / kernel.filename, kernel.template) + + +def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: + update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) + + +def write_blobs( + output_dir: Path, kernel_filter: str, receipt, optdim_list, mask_impl +) -> None: + api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) + for kernel in kernels: + write_single_fwd_kernel(kernel, output_dir) + write_fwd_api(api_pool, output_dir) + + +def list_blobs( 
+ file_path: Path, kernel_filter: str, receipt, optdim_list, mask_impl +) -> None: + with file_path.open("a") as f: + _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) + for kernel in kernels: + f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") + f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp index 7349c3576e8..25e3513d2fa 100644 --- a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp +++ b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp @@ -277,6 +277,9 @@ struct fmha_jenga_fwd_traits float fmha_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); +// sparge jenga +float sparge_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); + template float fmha_jenga_fwd_(const ck_tile::stream_config&, fmha_jenga_fwd_args); @@ -322,6 +325,9 @@ using fmha_vsa_fwd_traits = fmha_jenga_fwd_traits; float fmha_vsa_fwd(fmha_vsa_fwd_traits, fmha_vsa_fwd_args, const ck_tile::stream_config&); +// sparge vsa +float sparge_vsa_fwd(fmha_vsa_fwd_traits, fmha_vsa_fwd_args, const ck_tile::stream_config&); + template float fmha_vsa_fwd_(const ck_tile::stream_config&, fmha_vsa_fwd_args); diff --git a/example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp b/example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp new file mode 100644 index 00000000000..88f3e08204e --- /dev/null +++ b/example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp @@ -0,0 +1,189 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#include "jenga_sparge_attention.h" +#include "fmha_fwd_trek.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/device_memory.hpp" +#include + +template +ck_tile::HostTensor +jenga_sparge_attention(const ck_tile::HostTensor& TQ, + const ck_tile::HostTensor& TK, + const ck_tile::HostTensor& TV, + const ck_tile::HostTensor& Tblock_relation_onehot, + ck_tile::HostTensor& Y, + int batch, + int nhead, + int nhead_k, + int seqlen_q, + int seqlen_k, + int hdim_q, + int hdim_v, + bool i_perm, + bool o_perm, + int max_seqlen_q, + int max_seqlen_k, + int log_level) +{ + static_assert(std::is_same_v || + std::is_same_v, + "Jenga sparse attention supports fp16/bf16 only."); + std::string data_type = "fp16"; + if constexpr(std::is_same_v) + { + data_type = "bf16"; + } + + if(max_seqlen_q == 0) + max_seqlen_q = seqlen_q; + if(max_seqlen_k == 0) + max_seqlen_k = seqlen_k; + bool is_v_rowmajor = true; + float scale_s = 1.0 / ck_tile::sqrt(static_cast(hdim_q)); + std::string msk_str = "0"; + mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); + + const ck_tile::index_t shape_seqlen_q = seqlen_q; + const ck_tile::index_t shape_seqlen_k = seqlen_k; + + ck_tile::stream_config stream_config{nullptr, + false, // time_kernel + log_level, + 0, + 1, + false}; + + ck_tile::DeviceMem q_buf(TQ.get_element_space_size_in_bytes()); + ck_tile::DeviceMem k_buf(TK.get_element_space_size_in_bytes()); + ck_tile::DeviceMem v_buf(TV.get_element_space_size_in_bytes()); + ck_tile::DeviceMem block_relation_buf(Tblock_relation_onehot.get_element_space_size_in_bytes()); + ck_tile::DeviceMem o_buf(Y.get_element_space_size_in_bytes()); + + q_buf.ToDevice(TQ.data()); + k_buf.ToDevice(TK.data()); + v_buf.ToDevice(TV.data()); + block_relation_buf.ToDevice(Tblock_relation_onehot.data()); + + 
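+    // init_args fills the kernel argument struct; the strides assume contiguous
+    // tensors, with i_perm selecting BHSD (true) vs BSHD (false) layout for
+    // Q/K/V and o_perm doing the same for the output tensor.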
const auto init_args = [&](auto& args) { + assert(nhead % nhead_k == 0); + const ck_tile::index_t stride_q = (i_perm ? hdim_q : nhead * hdim_q); + const ck_tile::index_t stride_k = (i_perm ? hdim_q : nhead_k * hdim_q); + const ck_tile::index_t stride_v = [&]() { + if(is_v_rowmajor) + return i_perm ? hdim_v : nhead_k * hdim_v; + else + return (i_perm ? shape_seqlen_k : nhead_k * shape_seqlen_k); + }(); + const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v); + const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q); + const ck_tile::index_t nhead_stride_k = i_perm ? shape_seqlen_k * hdim_q : hdim_q; + const ck_tile::index_t nhead_stride_v = [&]() { + if(is_v_rowmajor) + return i_perm ? shape_seqlen_k * hdim_v : hdim_v; + else + return i_perm ? hdim_v * shape_seqlen_k : shape_seqlen_k; + }(); + const ck_tile::index_t nhead_stride_o = (o_perm ? shape_seqlen_q * hdim_v : hdim_v); + const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q); + const ck_tile::index_t batch_stride_k = nhead_k * shape_seqlen_k * hdim_q; + const ck_tile::index_t batch_stride_v = nhead_k * hdim_v * shape_seqlen_k; + const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v); + + args.q_ptr = q_buf.GetDeviceBuffer(); + args.k_ptr = k_buf.GetDeviceBuffer(); + args.v_ptr = v_buf.GetDeviceBuffer(); + args.block_relation_onehot_ptr = block_relation_buf.GetDeviceBuffer(); + + args.batch = batch; + args.seqlen_q = shape_seqlen_q; + args.hdim_q = hdim_q; + args.hdim_v = hdim_v; + args.nhead_q = nhead; + args.nhead_k = nhead_k; + + args.stride_q = stride_q; + args.stride_k = stride_k; + args.stride_v = stride_v; + args.nhead_stride_q = nhead_stride_q; + args.nhead_stride_k = nhead_stride_k; + args.nhead_stride_v = nhead_stride_v; + args.batch_stride_q = batch_stride_q; + args.batch_stride_k = batch_stride_k; + args.batch_stride_v = batch_stride_v; + + args.o_ptr = o_buf.GetDeviceBuffer(); + + args.seqlen_k = shape_seqlen_k; + args.max_seqlen_q = max_seqlen_q; + + args.scale_s = scale_s; + + args.stride_o = stride_o; + args.nhead_stride_o = nhead_stride_o; + args.batch_stride_o = batch_stride_o; + + args.window_size_left = mask.left; + args.window_size_right = mask.right; + args.mask_type = static_cast(mask.type); + }; + + const auto init_traits = [&](auto& traits) { + traits.hdim_q = hdim_q; + traits.hdim_v = hdim_v; + traits.data_type = data_type; + traits.is_v_rowmajor = is_v_rowmajor; + traits.mask_type = mask.type; + }; + + fmha_jenga_fwd_traits fmha_traits; + init_traits(fmha_traits); + + fmha_jenga_fwd_args args; + init_args(args); + + sparge_jenga_fwd(fmha_traits, args, stream_config); + + o_buf.FromDevice(Y.data(), Y.get_element_space_size_in_bytes()); + + return Y; +} + +template ck_tile::HostTensor +jenga_sparge_attention(const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + ck_tile::HostTensor&, + int, + int, + int, + int, + int, + int, + int, + bool, + bool, + int, + int, + int); + +template ck_tile::HostTensor +jenga_sparge_attention(const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + ck_tile::HostTensor&, + int, + int, + int, + int, + int, + int, + int, + bool, + bool, + int, + int, + int); diff --git a/example/ck_tile/50_sparse_attn/jenga_sparge_attention.h b/example/ck_tile/50_sparse_attn/jenga_sparge_attention.h new file mode 100644 index 00000000000..6259fcc73cf --- /dev/null +++ 
b/example/ck_tile/50_sparse_attn/jenga_sparge_attention.h @@ -0,0 +1,27 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#pragma once +#include +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +template +ck_tile::HostTensor +jenga_sparge_attention(const ck_tile::HostTensor& TQ, + const ck_tile::HostTensor& TK, + const ck_tile::HostTensor& TV, + const ck_tile::HostTensor& Tblock_relation_onehot, + ck_tile::HostTensor& Y, + int batch, + int nhead, + int nhead_k, + int seqlen_q, + int seqlen_k, + int hdim_q, + int hdim_v, + bool i_perm, + bool o_perm, + int max_seqlen_q, + int max_seqlen_k, + int log_level = 0); diff --git a/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp index 0bd664adf68..590e51db144 100644 --- a/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp +++ b/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp @@ -16,7 +16,7 @@ #include "ck_tile/host/reference/reference_blocked_attention.hpp" #include "ck_tile/core/utility/bit_cast.hpp" -#include "jenga_sparse_attention.h" +#include "jenga_sparge_attention.h" #include "sparge_tool.hpp" // ============================================================================ @@ -115,7 +115,7 @@ auto create_args(int argc, char* argv[]) .insert("repeat", "20", "benchmark iterations") .insert("kname", "0", "print kernel name") // Sparge-specific - .insert("blkq", "128", "Sparge BLKQ") + .insert("blkq", "64", "Sparge BLKQ") .insert("blkk", "128", "Sparge BLKK") .insert("simthreshd1", "0.6", "Sparge sim threshold") .insert("cdfthreshd", "0.98", "Sparge CDF threshold (used when topk < 0)") @@ -161,10 +161,10 @@ bool run_test(const ck_tile::ArgParser& arg_parser) if(hdim_v < 0) hdim_v = hdim_q; - if(blkq != 128 || blkk != 128 || hdim_q != 128 || hdim_v != 128) + if(blkq != 64 || blkk != 128 || hdim_q != 128 || hdim_v != 128) { std::cout << "\n>>> TEST SKIPPED <<<" << std::endl; - std::cout << "Jenga/VSA kernel instances are generated for BLKQ=BLKK=128, " + std::cout << "Sparge Jenga kernel instances are generated for BLKQ=64, BLKK=128, " "hdim_q=128, hdim_v=128 only." 
<< std::endl; std::cout << "TEST SKIPPED" << std::endl; @@ -247,7 +247,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) { if(kname) { - jenga_sparse_attention(q_host, + jenga_sparge_attention(q_host, k_host, v_host, block_relation_onehot, @@ -268,7 +268,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) for(int i = 0; i < warmup; ++i) { - jenga_sparse_attention(q_host, + jenga_sparge_attention(q_host, k_host, v_host, block_relation_onehot, @@ -292,7 +292,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) for(int i = 0; i < repeat; ++i) { - jenga_sparse_attention(q_host, + jenga_sparge_attention(q_host, k_host, v_host, block_relation_onehot, diff --git a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp index dd1d3e60bee..c0feb23e581 100644 --- a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp +++ b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp @@ -16,7 +16,7 @@ #include "ck_tile/host/reference/reference_blocked_attention.hpp" #include "ck_tile/core/utility/bit_cast.hpp" -#include "jenga_sparse_attention.h" +#include "vsa_sparge_attention.h" #include "sparge_tool.hpp" // ============================================================================ @@ -115,7 +115,7 @@ auto create_args(int argc, char* argv[]) .insert("repeat", "20", "benchmark iterations") .insert("kname", "0", "print kernel name") // Sparge-specific - .insert("blkq", "128", "Sparge BLKQ") + .insert("blkq", "64", "Sparge BLKQ") .insert("blkk", "128", "Sparge BLKK") .insert("simthreshd1", "0.6", "Sparge sim threshold") .insert("cdfthreshd", "0.98", "Sparge CDF threshold (used when topk < 0)") @@ -161,10 +161,10 @@ bool run_test(const ck_tile::ArgParser& arg_parser) if(hdim_v < 0) hdim_v = hdim_q; - if(blkq != 128 || blkk != 128 || hdim_q != 128 || hdim_v != 128) + if(blkq != 64 || blkk != 128 || hdim_q != 128 || hdim_v != 128) { std::cout << "\n>>> TEST SKIPPED <<<" << std::endl; - std::cout << "VSA kernel instances are generated for BLKQ=BLKK=128, " + std::cout << "Sparge VSA kernel instances are generated for BLKQ=64, BLKK=128, " "hdim_q=128, hdim_v=128 only." << std::endl; std::cout << "TEST SKIPPED" << std::endl; @@ -251,7 +251,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) { if(kname) { - vsa_sparse_attention(q_host, + vsa_sparge_attention(q_host, k_host, v_host, vsa_lut.lut, @@ -273,7 +273,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) for(int i = 0; i < warmup; ++i) { - vsa_sparse_attention(q_host, + vsa_sparge_attention(q_host, k_host, v_host, vsa_lut.lut, @@ -298,7 +298,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) for(int i = 0; i < repeat; ++i) { - vsa_sparse_attention(q_host, + vsa_sparge_attention(q_host, k_host, v_host, vsa_lut.lut, diff --git a/example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp b/example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp new file mode 100644 index 00000000000..5f9c2676ddb --- /dev/null +++ b/example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp @@ -0,0 +1,195 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#include "vsa_sparge_attention.h" +#include "fmha_fwd_trek.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/device_memory.hpp" +#include + +template +ck_tile::HostTensor +vsa_sparge_attention(const ck_tile::HostTensor& TQ, + const ck_tile::HostTensor& TK, + const ck_tile::HostTensor& TV, + const ck_tile::HostTensor& TKV_block_idx, + const ck_tile::HostTensor& TKV_blocks, + ck_tile::HostTensor& Y, + int batch, + int nhead, + int nhead_k, + int seqlen_q, + int seqlen_k, + int hdim_q, + int hdim_v, + bool i_perm, + bool o_perm, + int max_seqlen_q, + int max_seqlen_k, + int log_level) +{ + static_assert(std::is_same_v || + std::is_same_v, + "VSA sparse attention supports fp16/bf16 only."); + std::string data_type = "fp16"; + if constexpr(std::is_same_v) + { + data_type = "bf16"; + } + + if(max_seqlen_q == 0) + max_seqlen_q = seqlen_q; + if(max_seqlen_k == 0) + max_seqlen_k = seqlen_k; + bool is_v_rowmajor = true; + float scale_s = 1.0 / ck_tile::sqrt(static_cast(hdim_q)); + std::string msk_str = "0"; + mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); + + const ck_tile::index_t shape_seqlen_q = seqlen_q; + const ck_tile::index_t shape_seqlen_k = seqlen_k; + + ck_tile::stream_config stream_config{nullptr, + false, // time_kernel + log_level, + 0, + 1, + false}; + + ck_tile::DeviceMem q_buf(TQ.get_element_space_size_in_bytes()); + ck_tile::DeviceMem k_buf(TK.get_element_space_size_in_bytes()); + ck_tile::DeviceMem v_buf(TV.get_element_space_size_in_bytes()); + ck_tile::DeviceMem lut_buf(TKV_block_idx.get_element_space_size_in_bytes()); + ck_tile::DeviceMem valid_block_num_buf(TKV_blocks.get_element_space_size_in_bytes()); + ck_tile::DeviceMem o_buf(Y.get_element_space_size_in_bytes()); + + q_buf.ToDevice(TQ.data()); + k_buf.ToDevice(TK.data()); + v_buf.ToDevice(TV.data()); + lut_buf.ToDevice(TKV_block_idx.data()); + valid_block_num_buf.ToDevice(TKV_blocks.data()); + + const auto init_args = [&](auto& args) { + assert(nhead % nhead_k == 0); + const ck_tile::index_t stride_q = (i_perm ? hdim_q : nhead * hdim_q); + const ck_tile::index_t stride_k = (i_perm ? hdim_q : nhead_k * hdim_q); + const ck_tile::index_t stride_v = [&]() { + if(is_v_rowmajor) + return i_perm ? hdim_v : nhead_k * hdim_v; + else + return (i_perm ? shape_seqlen_k : nhead_k * shape_seqlen_k); + }(); + const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v); + const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q); + const ck_tile::index_t nhead_stride_k = i_perm ? shape_seqlen_k * hdim_q : hdim_q; + const ck_tile::index_t nhead_stride_v = [&]() { + if(is_v_rowmajor) + return i_perm ? shape_seqlen_k * hdim_v : hdim_v; + else + return i_perm ? hdim_v * shape_seqlen_k : shape_seqlen_k; + }(); + const ck_tile::index_t nhead_stride_o = (o_perm ? 
shape_seqlen_q * hdim_v : hdim_v); + const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q); + const ck_tile::index_t batch_stride_k = nhead_k * shape_seqlen_k * hdim_q; + const ck_tile::index_t batch_stride_v = nhead_k * hdim_v * shape_seqlen_k; + const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v); + + args.q_ptr = q_buf.GetDeviceBuffer(); + args.k_ptr = k_buf.GetDeviceBuffer(); + args.v_ptr = v_buf.GetDeviceBuffer(); + args.lut_ptr = lut_buf.GetDeviceBuffer(); + args.valid_block_num_ptr = valid_block_num_buf.GetDeviceBuffer(); + + args.batch = batch; + args.seqlen_q = shape_seqlen_q; + args.hdim_q = hdim_q; + args.hdim_v = hdim_v; + args.nhead_q = nhead; + args.nhead_k = nhead_k; + + args.stride_q = stride_q; + args.stride_k = stride_k; + args.stride_v = stride_v; + args.nhead_stride_q = nhead_stride_q; + args.nhead_stride_k = nhead_stride_k; + args.nhead_stride_v = nhead_stride_v; + args.batch_stride_q = batch_stride_q; + args.batch_stride_k = batch_stride_k; + args.batch_stride_v = batch_stride_v; + + args.o_ptr = o_buf.GetDeviceBuffer(); + + args.seqlen_k = shape_seqlen_k; + args.max_seqlen_q = max_seqlen_q; + + args.scale_s = scale_s; + + args.stride_o = stride_o; + args.nhead_stride_o = nhead_stride_o; + args.batch_stride_o = batch_stride_o; + + args.window_size_left = mask.left; + args.window_size_right = mask.right; + args.mask_type = static_cast(mask.type); + }; + + const auto init_traits = [&](auto& traits) { + traits.hdim_q = hdim_q; + traits.hdim_v = hdim_v; + traits.data_type = data_type; + traits.is_v_rowmajor = is_v_rowmajor; + traits.mask_type = mask.type; + }; + + fmha_vsa_fwd_traits fmha_traits; + init_traits(fmha_traits); + + fmha_vsa_fwd_args args; + init_args(args); + + sparge_vsa_fwd(fmha_traits, args, stream_config); + + o_buf.FromDevice(Y.data(), Y.get_element_space_size_in_bytes()); + + return Y; +} + +template ck_tile::HostTensor +vsa_sparge_attention(const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + ck_tile::HostTensor&, + int, + int, + int, + int, + int, + int, + int, + bool, + bool, + int, + int, + int); + +template ck_tile::HostTensor +vsa_sparge_attention(const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + ck_tile::HostTensor&, + int, + int, + int, + int, + int, + int, + int, + bool, + bool, + int, + int, + int); diff --git a/example/ck_tile/50_sparse_attn/vsa_sparge_attention.h b/example/ck_tile/50_sparse_attn/vsa_sparge_attention.h new file mode 100644 index 00000000000..d51a7e8c00b --- /dev/null +++ b/example/ck_tile/50_sparse_attn/vsa_sparge_attention.h @@ -0,0 +1,28 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once +#include +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" + +template +ck_tile::HostTensor +vsa_sparge_attention(const ck_tile::HostTensor& TQ, + const ck_tile::HostTensor& TK, + const ck_tile::HostTensor& TV, + const ck_tile::HostTensor& TKV_block_idx, + const ck_tile::HostTensor& TKV_blocks, + ck_tile::HostTensor& Y, + int batch, + int nhead, + int nhead_k, + int seqlen_q, + int seqlen_k, + int hdim_q, + int hdim_v, + bool i_perm, + bool o_perm, + int max_seqlen_q, + int max_seqlen_k, + int log_level = 0); diff --git a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp index 2b097ae5827..578ad7e6039 100644 --- a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp +++ b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp @@ -200,7 +200,7 @@ struct BlockFmhaPipelineQRKSVSAsyncVSA constexpr auto gemm_0 = Policy::template GetQKBlockGemm(); constexpr auto gemm_1 = Policy::template GetKVBlockGemm(); - int seqlen_k_start = kv_block_idx_ptr[0] * kM0; + int seqlen_k_start = kv_block_idx_ptr[0] * kN0; auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(), q_dram_block_window_tmp.get_window_lengths(), q_dram_block_window_tmp.get_window_origin(), From d1d457b82a63fdf6e68461194fa2c1098ace5f93 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Mon, 13 Apr 2026 03:34:08 -0400 Subject: [PATCH 3/7] Add sparge gpu pipeline in tile_example_sparge_vsa_sparse_attn --- example/ck_tile/50_sparse_attn/CMakeLists.txt | 34 +- .../50_sparse_attn/sparge_blockmap.cpp | 156 ++++++ .../ck_tile/50_sparse_attn/sparge_blockmap.h | 26 + .../50_sparse_attn/sparge_blockmap_inst.cpp | 88 +++ .../50_sparse_attn/sparge_blockmap_trek.hpp | 93 ++++ .../test_sparge_vsa_sparse_attn.cpp | 234 ++++++-- .../kernel/sparge_blockmap_kernel.hpp | 195 +++++++ .../pipeline/sparge_blockmap_pipeline.hpp | 521 ++++++++++++++++++ 8 files changed, 1296 insertions(+), 51 deletions(-) create mode 100644 example/ck_tile/50_sparse_attn/sparge_blockmap.cpp create mode 100644 example/ck_tile/50_sparse_attn/sparge_blockmap.h create mode 100644 example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp create mode 100644 example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp create mode 100644 include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp create mode 100644 include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp diff --git a/example/ck_tile/50_sparse_attn/CMakeLists.txt b/example/ck_tile/50_sparse_attn/CMakeLists.txt index 0ac86f6affa..169ed87ac3b 100644 --- a/example/ck_tile/50_sparse_attn/CMakeLists.txt +++ b/example/ck_tile/50_sparse_attn/CMakeLists.txt @@ -266,11 +266,41 @@ target_compile_options(${SPARGE_VSA_INSTANCES} PRIVATE -Wno-float-equal ) -# Sparge + VSA Example executable +# ============================================================================ +# Sparge BlockMap GPU Kernel (hand-written instantiation, no codegen) +# ============================================================================ +set(SPARGE_BLOCKMAP_INSTANCES "tile_sparge_blockmap_instances") + +add_library(${SPARGE_BLOCKMAP_INSTANCES} OBJECT EXCLUDE_FROM_ALL + ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap_inst.cpp + ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap.cpp +) +target_include_directories(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE + ${CMAKE_CURRENT_LIST_DIR} + 
${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn +) +set_source_files_properties( + ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap_inst.cpp + ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap.cpp + PROPERTIES LANGUAGE HIP +) +set_property(TARGET ${SPARGE_BLOCKMAP_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) + +target_compile_options(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE + -DCK_TILE_USE_BUFFER_ADDRESSING_BUILTIN + -DCK_TILE_FMHA_FWD_FAST_EXP2 + -Wno-undefined-func-template + -Wno-float-equal +) + +# Sparge + VSA Example executable (now links blockmap kernel too) set(EXAMPLE_SPARGE_VSA_SPARSE_ATTN "tile_example_sparge_vsa_sparse_attn") message(DEBUG "adding example ${EXAMPLE_SPARGE_VSA_SPARSE_ATTN}") add_executable(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_vsa_sparse_attn.cpp) -target_link_libraries(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} ${SPARGE_VSA_INSTANCES}) +target_link_libraries(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} + ${SPARGE_VSA_INSTANCES} + ${SPARGE_BLOCKMAP_INSTANCES} +) target_include_directories(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_compile_options(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE -Wno-undefined-func-template diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap.cpp b/example/ck_tile/50_sparse_attn/sparge_blockmap.cpp new file mode 100644 index 00000000000..b9ac56c533c --- /dev/null +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap.cpp @@ -0,0 +1,156 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#include "sparge_blockmap.h" +#include "sparge_blockmap_trek.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/device_memory.hpp" +#include +#include + +template +sparge::VSALut sparge_blockmap_gpu(const ck_tile::HostTensor& TQ, + const ck_tile::HostTensor& TK, + ck_tile::HostTensor& block_map_out, + int batch, + int nhead_q, + int nhead_k, + int seqlen_q, + int seqlen_k, + int hdim_q, + bool i_perm, + float simthreshd1, + float cdfthreshd, + float topk, + int blkq, + int blkk, + int log_level) +{ + static_assert(std::is_same_v || + std::is_same_v, + "sparge_blockmap_gpu supports fp16/bf16 only."); + + std::string data_type = "fp16"; + if constexpr(std::is_same_v) + { + data_type = "bf16"; + } + + const ck_tile::index_t num_q_blocks = ck_tile::integer_divide_ceil(seqlen_q, blkq); + const ck_tile::index_t num_k_blocks = ck_tile::integer_divide_ceil(seqlen_k, blkk); + + const float scale = 1.0f / std::sqrt(static_cast(hdim_q)); + + // Allocate device memory + ck_tile::DeviceMem q_buf(TQ.get_element_space_size_in_bytes()); + ck_tile::DeviceMem k_buf(TK.get_element_space_size_in_bytes()); + + const std::size_t bmap_bytes = + static_cast(batch) * nhead_q * num_q_blocks * num_k_blocks * sizeof(uint8_t); + const std::size_t lut_bytes = + static_cast(batch) * nhead_q * num_q_blocks * num_k_blocks * sizeof(int32_t); + const std::size_t valid_bytes = + static_cast(batch) * nhead_q * num_q_blocks * sizeof(int32_t); + + ck_tile::DeviceMem bmap_buf(bmap_bytes); + ck_tile::DeviceMem lut_buf(lut_bytes); + ck_tile::DeviceMem valid_buf(valid_bytes); + + q_buf.ToDevice(TQ.data()); + k_buf.ToDevice(TK.data()); + bmap_buf.SetZero(); + lut_buf.SetZero(); + valid_buf.SetZero(); + + // Compute strides (assumes BHSD if i_perm, BSHD otherwise) + const ck_tile::index_t stride_q = i_perm ? hdim_q : nhead_q * hdim_q; + const ck_tile::index_t stride_k = i_perm ? hdim_q : nhead_k * hdim_q; + const ck_tile::index_t nhead_stride_q = + i_perm ? 
static_cast(seqlen_q) * hdim_q : hdim_q; + const ck_tile::index_t nhead_stride_k = + i_perm ? static_cast(seqlen_k) * hdim_q : hdim_q; + const ck_tile::index_t batch_stride_q = + static_cast(nhead_q) * seqlen_q * hdim_q; + const ck_tile::index_t batch_stride_k = + static_cast(nhead_k) * seqlen_k * hdim_q; + + ck_tile::stream_config stream_config{nullptr, false, log_level, 0, 1, false}; + + sparge_blockmap_args args; + args.q_ptr = q_buf.GetDeviceBuffer(); + args.k_ptr = k_buf.GetDeviceBuffer(); + args.batch = batch; + args.seqlen_q = seqlen_q; + args.seqlen_k = seqlen_k; + args.hdim_q = hdim_q; + args.nhead_q = nhead_q; + args.nhead_k = nhead_k; + args.stride_q = stride_q; + args.stride_k = stride_k; + args.nhead_stride_q = nhead_stride_q; + args.nhead_stride_k = nhead_stride_k; + args.batch_stride_q = batch_stride_q; + args.batch_stride_k = batch_stride_k; + args.simthreshd1 = simthreshd1; + args.cdfthreshd = cdfthreshd; + args.topk = topk; + args.scale = scale; + args.block_map_ptr = bmap_buf.GetDeviceBuffer(); + args.lut_ptr = lut_buf.GetDeviceBuffer(); + args.valid_block_num_ptr = valid_buf.GetDeviceBuffer(); + + sparge_blockmap_traits traits; + traits.data_type = data_type; + traits.hdim_q = hdim_q; + + sparge_blockmap_fwd(traits, args, stream_config); + + // Copy results back to host + bmap_buf.FromDevice(block_map_out.data(), bmap_bytes); + + sparge::VSALut vsa_lut{ + ck_tile::HostTensor({batch, nhead_q, num_q_blocks, num_k_blocks}), + ck_tile::HostTensor({batch, nhead_q, num_q_blocks}), + }; + lut_buf.FromDevice(vsa_lut.lut.data(), lut_bytes); + valid_buf.FromDevice(vsa_lut.valid_block_num.data(), valid_bytes); + + return vsa_lut; +} + +// Explicit template instantiations +template sparge::VSALut +sparge_blockmap_gpu(const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + ck_tile::HostTensor&, + int, + int, + int, + int, + int, + int, + bool, + float, + float, + float, + int, + int, + int); + +template sparge::VSALut +sparge_blockmap_gpu(const ck_tile::HostTensor&, + const ck_tile::HostTensor&, + ck_tile::HostTensor&, + int, + int, + int, + int, + int, + int, + bool, + float, + float, + float, + int, + int, + int); diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap.h b/example/ck_tile/50_sparse_attn/sparge_blockmap.h new file mode 100644 index 00000000000..3057257ca14 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap.h @@ -0,0 +1,26 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#pragma once + +#include +#include "ck_tile/core.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "sparge_tool.hpp" + +template +sparge::VSALut sparge_blockmap_gpu(const ck_tile::HostTensor& TQ, + const ck_tile::HostTensor& TK, + ck_tile::HostTensor& block_map_out, + int batch, + int nhead_q, + int nhead_k, + int seqlen_q, + int seqlen_k, + int hdim_q, + bool i_perm, + float simthreshd1, + float cdfthreshd, + float topk, + int blkq, + int blkk, + int log_level = 0); diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp new file mode 100644 index 00000000000..fbd18b9ff24 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp @@ -0,0 +1,88 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +// Hand-written template instantiation for SpargeBlockMapKernel (fp16, D=128). 
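+//
+// Note: the dispatch in sparge_blockmap_fwd() at the bottom of this file only
+// covers data_type == "fp16" with hdim_q == 128; callers requesting "bf16" fall
+// through to the error path and get -1. A bf16 instantiation would presumably
+// mirror the fp16 aliases below with the bf16 type config and a second dispatch
+// branch (assumption - not added in this patch).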
+ +#include "sparge_blockmap_trek.hpp" +#include "ck_tile/ops/fmha/block/variants.hpp" + +#include + +// ============================================================================ +// Type configuration for block map kernel (reuses FmhaSparseFwdTypeConfig) +// ============================================================================ + +// fp16: D=128, kM0=64, kN0=128 +using bmap_fp16_block_tile = ck_tile::sequence<64, 128, 128, 128, 128, 128>; +// kM0 kN0 kK0 kN1 kK1 kQKHeaddim(D) + +using bmap_fp16_shape = + ck_tile::TileFmhaShape, // Gemm0BlockWarps + ck_tile::sequence<16, 16, 16>, // Gemm0WarpTile (unused by blockmap, but + // needed by shape) + ck_tile::sequence<4, 1, 1>, // Gemm1BlockWarps + ck_tile::sequence<16, 16, 16>, // Gemm1WarpTile + true>; // VLayout row-major + +using bmap_fp16_trait = ck_tile::TileFmhaTraits; // kIsVRowMajorSkip + +using bmap_fp16_variant = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>; +using bmap_fp16_mask = ck_tile::GenericAttentionMask; + +using bmap_fp16_problem = ck_tile::BlockFmhaPipelineProblem; + +using bmap_fp16_pipeline = ck_tile::SpargeBlockMapPipeline; +using bmap_fp16_kernel = ck_tile::SpargeBlockMapKernel; + +// ============================================================================ +// Dispatch +// ============================================================================ + +float sparge_blockmap_fwd(sparge_blockmap_traits traits, + sparge_blockmap_args args, + const ck_tile::stream_config& s) +{ + if(traits.data_type == "fp16" && traits.hdim_q == 128) + { + using k_ = bmap_fp16_kernel; + if(s.log_level_ > 0) + std::cout << ", sparge_blockmap_fp16_d128" << std::flush; + auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)); + } + + if(s.log_level_ > 0) + std::cerr << "sparge_blockmap_fwd: unsupported config (data_type=" << traits.data_type + << ", hdim_q=" << traits.hdim_q << ")" << std::endl; + return -1.f; +} diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp new file mode 100644 index 00000000000..1e7e33248a2 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp @@ -0,0 +1,93 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp" +#include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp" +#include "ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp" +#include "ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp" + +#include "fmha_fwd_trek.hpp" + +#include +#include + +// ============================================================================ +// Args and traits for sparge block map GPU kernel +// ============================================================================ +struct sparge_blockmap_args +{ + const void* q_ptr; + const void* k_ptr; + + ck_tile::index_t batch; + ck_tile::index_t seqlen_q; + ck_tile::index_t seqlen_k; + ck_tile::index_t hdim_q; + ck_tile::index_t nhead_q; + ck_tile::index_t nhead_k; + + ck_tile::index_t stride_q; + ck_tile::index_t stride_k; + ck_tile::index_t nhead_stride_q; + ck_tile::index_t nhead_stride_k; + ck_tile::index_t batch_stride_q; + ck_tile::index_t batch_stride_k; + + float simthreshd1; + float cdfthreshd; + float topk; + float scale; + + void* block_map_ptr; + void* lut_ptr; + void* valid_block_num_ptr; +}; + +struct sparge_blockmap_traits +{ + std::string data_type; + int hdim_q; +}; + +// ============================================================================ +// Create kernel args and grid dimensions +// ============================================================================ +template +auto sparge_blockmap_create_kargs_and_grids(sparge_blockmap_args args) +{ + assert(args.nhead_q % args.nhead_k == 0); + auto kargs = BlockMapKernel::MakeKargs(args.q_ptr, + args.k_ptr, + args.seqlen_q, + args.seqlen_k, + args.hdim_q, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.stride_q, + args.stride_k, + args.nhead_stride_q, + args.nhead_stride_k, + args.batch_stride_q, + args.batch_stride_k, + args.simthreshd1, + args.cdfthreshd, + args.topk, + args.scale, + args.block_map_ptr, + args.lut_ptr, + args.valid_block_num_ptr); + + dim3 grids = BlockMapKernel::GridSize(args.batch, args.nhead_q, args.seqlen_q); + return ck_tile::make_tuple(kargs, grids); +} + +// ============================================================================ +// Hand-written template instantiation dispatch +// ============================================================================ +float sparge_blockmap_fwd(sparge_blockmap_traits traits, + sparge_blockmap_args args, + const ck_tile::stream_config& stream_config); diff --git a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp index c0feb23e581..638a867b0f3 100644 --- a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp +++ b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp @@ -17,6 +17,7 @@ #include "ck_tile/core/utility/bit_cast.hpp" #include "vsa_sparge_attention.h" +#include "sparge_blockmap.h" #include "sparge_tool.hpp" // ============================================================================ @@ -198,53 +199,37 @@ bool run_test(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor output_host = o_perm ? ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); - ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); std::cout << "\nInitializing tensors..." 
<< std::endl; ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); - // Build block map using Sparge tool - std::cout << "Building Sparge block map..." << std::endl; - sparge::SpargeParams p; - p.BLKQ = static_cast(BLKQ); - p.BLKK = static_cast(BLKK); - p.simthreshd1 = simthreshd1; - p.cdfthreshd = cdfthreshd; - p.topk = topk; - p.i_perm = i_perm; - - ck_tile::HostTensor block_relation_onehot = - sparge::build_block_map_meansim(q_host, k_host, p); - - // Convert to VSA LUT (delta-encoded) + valid_block_num - std::cout << "Converting block map to VSA LUT (delta)..." << std::endl; - auto vsa_lut = sparge::block_map_to_vsa_lut_delta(block_relation_onehot); - - // Print actual sparsity (based on one-hot) - std::size_t total_blocks = 0; - std::size_t active_blocks = 0; - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) - { - for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) - { - total_blocks++; - if(block_relation_onehot(b, h, qb, kb) != 0) - active_blocks++; - } - } - } - } - float actual_sparsity = - 1.0f - static_cast(active_blocks) / static_cast(total_blocks); - std::cout << " Actual sparsity: " << actual_sparsity << " (" << active_blocks << "/" - << total_blocks << " blocks active)" << std::endl; - + // ================================================================== + // GPU: Build block map + VSA LUT in one kernel (always run) + // ================================================================== + std::cout << "Building Sparge block map + VSA LUT (GPU)..." << std::endl; + ck_tile::HostTensor block_map_gpu({batch, nhead, num_q_blocks, num_k_blocks}); + auto vsa_lut_gpu = sparge_blockmap_gpu(q_host, + k_host, + block_map_gpu, + batch, + nhead, + nhead_k, + seqlen_q, + seqlen_k, + hdim_q, + i_perm, + simthreshd1, + cdfthreshd, + topk, + static_cast(BLKQ), + static_cast(BLKK), + 0); + + // ================================================================== + // VSA sparse attention kernel (always run) + // ================================================================== std::cout << "\n--- Running VSA sparse attention kernel ---" << std::endl; try @@ -254,8 +239,8 @@ bool run_test(const ck_tile::ArgParser& arg_parser) vsa_sparge_attention(q_host, k_host, v_host, - vsa_lut.lut, - vsa_lut.valid_block_num, + vsa_lut_gpu.lut, + vsa_lut_gpu.valid_block_num, output_host, batch, nhead, @@ -276,8 +261,8 @@ bool run_test(const ck_tile::ArgParser& arg_parser) vsa_sparge_attention(q_host, k_host, v_host, - vsa_lut.lut, - vsa_lut.valid_block_num, + vsa_lut_gpu.lut, + vsa_lut_gpu.valid_block_num, output_host, batch, nhead, @@ -301,8 +286,8 @@ bool run_test(const ck_tile::ArgParser& arg_parser) vsa_sparge_attention(q_host, k_host, v_host, - vsa_lut.lut, - vsa_lut.valid_block_num, + vsa_lut_gpu.lut, + vsa_lut_gpu.valid_block_num, output_host, batch, nhead, @@ -332,17 +317,168 @@ bool run_test(const ck_tile::ArgParser& arg_parser) return false; } + // ================================================================== + // Sparsity statistics (always run, pure CPU read of HostTensor) + // ================================================================== + std::size_t total_blocks = 0; + std::size_t active_blocks = 0; + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t qb = 0; qb < 
num_q_blocks; ++qb) + { + for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) + { + total_blocks++; + if(block_map_gpu(b, h, qb, kb) != 0) + active_blocks++; + } + } + } + } + float actual_sparsity = + 1.0f - static_cast(active_blocks) / static_cast(total_blocks); + std::cout << "\n Actual sparsity: " << actual_sparsity << " (" << active_blocks << "/" + << total_blocks << " blocks active)" << std::endl; + + // ================================================================== + // Validation (only when -v=1) + // ================================================================== bool pass = true; if(do_validation) { std::cout << "\n--- Performing CPU validation ---" << std::endl; + + // CPU golden: block map + VSA LUT + std::cout << "Building Sparge block map (CPU golden)..." << std::endl; + sparge::SpargeParams p; + p.BLKQ = static_cast(BLKQ); + p.BLKK = static_cast(BLKK); + p.simthreshd1 = simthreshd1; + p.cdfthreshd = cdfthreshd; + p.topk = topk; + p.i_perm = i_perm; + + ck_tile::HostTensor block_relation_onehot = + sparge::build_block_map_meansim(q_host, k_host, p); + + std::cout << "Converting block map to VSA LUT (delta, CPU)..." << std::endl; + auto vsa_lut_cpu = sparge::block_map_to_vsa_lut_delta(block_relation_onehot); + + // Validate block map + std::cout << "\n--- Validating GPU block map vs CPU golden ---" << std::endl; + { + std::size_t bmap_mismatches = 0; + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) + { + for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) + { + if(block_map_gpu(b, h, qb, kb) != + block_relation_onehot(b, h, qb, kb)) + { + bmap_mismatches++; + if(bmap_mismatches <= 10) + { + std::cout + << " block_map mismatch at [" << b << "," << h << "," + << qb << "," << kb + << "]: GPU=" + << static_cast(block_map_gpu(b, h, qb, kb)) + << " CPU=" + << static_cast( + block_relation_onehot(b, h, qb, kb)) + << std::endl; + } + } + } + } + } + } + std::cout << " Block map mismatches: " << bmap_mismatches << " / " + << (batch * nhead * num_q_blocks * num_k_blocks) << std::endl; + if(bmap_mismatches > 0) + { + std::cout << ">>> GPU BLOCK MAP VALIDATION FAILED <<<" << std::endl; + pass = false; + } + else + { + std::cout << ">>> GPU BLOCK MAP VALIDATION PASSED <<<" << std::endl; + } + } + + // Validate VSA LUT + std::cout << "\n--- Validating GPU VSA LUT vs CPU golden ---" << std::endl; + { + std::size_t lut_mismatches = 0; + std::size_t valid_mismatches = 0; + for(ck_tile::index_t b = 0; b < batch; ++b) + { + for(ck_tile::index_t h = 0; h < nhead; ++h) + { + for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) + { + if(vsa_lut_gpu.valid_block_num(b, h, qb) != + vsa_lut_cpu.valid_block_num(b, h, qb)) + { + valid_mismatches++; + if(valid_mismatches <= 5) + { + std::cout + << " valid_block_num mismatch at [" << b << "," << h + << "," << qb + << "]: GPU=" << vsa_lut_gpu.valid_block_num(b, h, qb) + << " CPU=" << vsa_lut_cpu.valid_block_num(b, h, qb) + << std::endl; + } + } + for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) + { + if(vsa_lut_gpu.lut(b, h, qb, kb) != + vsa_lut_cpu.lut(b, h, qb, kb)) + { + lut_mismatches++; + if(lut_mismatches <= 10) + { + std::cout + << " LUT mismatch at [" << b << "," << h << "," << qb + << "," << kb + << "]: GPU=" << vsa_lut_gpu.lut(b, h, qb, kb) + << " CPU=" << vsa_lut_cpu.lut(b, h, qb, kb) + << std::endl; + } + } + } + } + } + } + std::cout << " LUT mismatches: " << lut_mismatches << std::endl; + std::cout << " valid_block_num 
mismatches: " << valid_mismatches << std::endl; + if(lut_mismatches == 0 && valid_mismatches == 0) + { + std::cout << ">>> GPU VSA LUT VALIDATION PASSED <<<" << std::endl; + } + else + { + std::cout << ">>> GPU VSA LUT VALIDATION FAILED <<<" << std::endl; + pass = false; + } + } + + // Validate attention output float scale = 1.0f / std::sqrt(static_cast(hdim_q)); - std::cout << "Computing reference output..." << std::endl; + std::cout << "\nComputing reference attention output..." << std::endl; auto q_ref = to_bhsd(q_host, i_perm); auto k_ref = to_bhsd(k_host, i_perm); auto v_ref = to_bhsd(v_host, i_perm); + ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); ck_tile::reference_blocked_attention( q_ref, k_ref, v_ref, block_relation_onehot, output_ref, BLKQ, BLKK, scale); @@ -374,7 +510,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) } } - std::cout << "\nValidation results:" << std::endl; + std::cout << "\nAttention validation results:" << std::endl; std::cout << " Max absolute difference: " << max_diff << std::endl; std::cout << " Max relative difference: " << max_rel_diff << std::endl; std::cout << " Number of mismatches: " << num_errors << " / " diff --git a/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp b/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp new file mode 100644 index 00000000000..ca177abf23a --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp @@ -0,0 +1,195 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include + +namespace ck_tile { + +template +struct SpargeBlockMapKernel +{ + using Pipeline = remove_cvref_t; + + static constexpr index_t kBlockSize = Pipeline::kBlockSize; + static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu; + + using QDataType = typename Pipeline::QDataType; + using KDataType = typename Pipeline::KDataType; + + static constexpr index_t kM0 = Pipeline::kM0; + static constexpr index_t kN0 = Pipeline::kN0; + static constexpr index_t D = Pipeline::D; + + static constexpr index_t kAlignment = 16 / sizeof(QDataType); + + struct Kargs + { + const void* q_ptr; + const void* k_ptr; + + index_t seqlen_q; + index_t seqlen_k; + index_t hdim_q; + + index_t nhead_q; + index_t nhead_ratio_qk; + + index_t stride_q; + index_t stride_k; + index_t nhead_stride_q; + index_t nhead_stride_k; + index_t batch_stride_q; + index_t batch_stride_k; + + float simthreshd1; + float cdfthreshd; + float topk; + float scale; + + void* block_map_ptr; + void* lut_ptr; + void* valid_block_num_ptr; + + index_t N_k; + }; + + CK_TILE_HOST static constexpr auto MakeKargs(const void* q_ptr, + const void* k_ptr, + index_t seqlen_q, + index_t seqlen_k, + index_t hdim_q, + index_t nhead_q, + index_t nhead_ratio_qk, + index_t stride_q, + index_t stride_k, + index_t nhead_stride_q, + index_t nhead_stride_k, + index_t batch_stride_q, + index_t batch_stride_k, + float simthreshd1, + float cdfthreshd, + float topk, + float scale, + void* block_map_ptr, + void* lut_ptr, + void* valid_block_num_ptr) + { + const index_t N_k = integer_divide_ceil(seqlen_k, kN0); + return Kargs{q_ptr, + k_ptr, + seqlen_q, + seqlen_k, + hdim_q, + nhead_q, + nhead_ratio_qk, + stride_q, + stride_k, + nhead_stride_q, + nhead_stride_k, + batch_stride_q, + batch_stride_k, + simthreshd1, + cdfthreshd, + topk, + scale, + block_map_ptr, + lut_ptr, + valid_block_num_ptr, + N_k}; + } + + CK_TILE_HOST static constexpr auto 
GridSize(index_t batch, index_t nhead_q, index_t seqlen_q) + { + const index_t Q_blk = integer_divide_ceil(seqlen_q, kM0); + return dim3(Q_blk, nhead_q, batch); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const index_t qb = static_cast(blockIdx.x); + const index_t hq = static_cast(blockIdx.y); + const index_t b = static_cast(blockIdx.z); + + const index_t hk = hq / kargs.nhead_ratio_qk; + + // Q pointer for this (batch, head, q_block) + const auto* q_base = reinterpret_cast(kargs.q_ptr) + + b * kargs.batch_stride_q + hq * kargs.nhead_stride_q + + qb * kM0 * kargs.stride_q; + + // K pointer for this (batch, head_k) + const auto* k_base = reinterpret_cast(kargs.k_ptr) + + b * kargs.batch_stride_k + hk * kargs.nhead_stride_k; + + // Q DRAM view with OOB padding + const auto q_dram_naive = make_naive_tensor_view( + q_base, + make_tuple(kargs.seqlen_q - qb * kM0, D), + make_tuple(kargs.stride_q, 1), + number{}, + number<1>{}); + const auto q_dram = pad_tensor_view( + q_dram_naive, make_tuple(number{}, number{}), sequence{}); + + auto q_window = make_tile_window(q_dram, + make_tuple(number{}, number{}), + {0, 0}, + Pipeline::MakeQBlockDistribution()); + + // K DRAM view with OOB padding + const auto k_dram_naive = + make_naive_tensor_view(k_base, + make_tuple(kargs.seqlen_k, D), + make_tuple(kargs.stride_k, 1), + number{}, + number<1>{}); + const auto k_dram = pad_tensor_view( + k_dram_naive, make_tuple(number{}, number{}), sequence{}); + + auto k_window = make_tile_window(k_dram, + make_tuple(number{}, number{}), + {0, 0}, + Pipeline::MakeKBlockDistribution()); + + // Output pointers for this (batch, head, q_block) + const index_t N_k = kargs.N_k; + const index_t bmap_offset = + (b * kargs.nhead_q + hq) * integer_divide_ceil(kargs.seqlen_q, kM0) * N_k + qb * N_k; + auto* bmap_ptr = reinterpret_cast(kargs.block_map_ptr) + bmap_offset; + + int32_t* lut_out = nullptr; + int32_t* valid_out = nullptr; + if(kargs.lut_ptr != nullptr) + { + lut_out = reinterpret_cast(kargs.lut_ptr) + bmap_offset; + const index_t valid_offset = + (b * kargs.nhead_q + hq) * integer_divide_ceil(kargs.seqlen_q, kM0) + qb; + valid_out = reinterpret_cast(kargs.valid_block_num_ptr) + valid_offset; + } + + // Shared memory + __shared__ char smem[Pipeline::GetSmemSize()]; + + Pipeline{}(q_window, + k_window, + kargs.seqlen_q, + kargs.seqlen_k, + qb, + N_k, + kargs.nhead_ratio_qk, + kargs.simthreshd1, + kargs.cdfthreshd, + kargs.topk, + kargs.scale, + bmap_ptr, + lut_out, + valid_out, + static_cast(smem)); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp b/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp new file mode 100644 index 00000000000..222e73c60e2 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp @@ -0,0 +1,521 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/reduce.hpp" + +namespace ck_tile { + +template +struct SpargeBlockMapPipeline +{ + using Problem = remove_cvref_t; + using QDataType = remove_cvref_t; + using KDataType = remove_cvref_t; + using BlockFmhaShape = remove_cvref_t; + + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t D = BlockFmhaShape::kQKHeaddim; + static constexpr index_t NumWarps = BlockFmhaShape::NumWarps; + static constexpr index_t WarpSize = get_warp_size(); + + static constexpr index_t KPerThread = 16 / sizeof(QDataType); + static constexpr index_t KThreads = D / KPerThread; + static constexpr index_t SeqThreadPerWarp = WarpSize / KThreads; + static constexpr index_t MPerThread = kM0 / (SeqThreadPerWarp * NumWarps); + static constexpr index_t NPerThread = kN0 / (SeqThreadPerWarp * NumWarps); + + static constexpr index_t kBlockPerCu = 1; + static constexpr index_t kMaxKBlocks = 1024; + + // LDS layout (non-overlapping, all used simultaneously in Phase 2): + // [0 .. kReduceBytes) cross-warp reduction scratch + // [kScoreOffset ..) scores[N_k] + // [kBmapOffset ..) block_map[N_k] + // [kSmallOffset ..) Phase 3 argmax scratch (2*NumWarps floats) + static constexpr index_t kReduceBytes = NumWarps * D * sizeof(float); + static constexpr index_t kScoreOffset = kReduceBytes; + static constexpr index_t kBmapOffset = kScoreOffset + kMaxKBlocks * sizeof(float); + static constexpr index_t kSmallOffset = kBmapOffset + kMaxKBlocks * sizeof(uint8_t); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return kSmallOffset + 2 * NumWarps * sizeof(float); + } + + CK_TILE_HOST_DEVICE static constexpr auto MakeQBlockDistribution() + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + + CK_TILE_HOST_DEVICE static constexpr auto MakeKBlockDistribution() + { + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + + // Extract tile data into a local float array via static_for (compile-time indices). + template + CK_TILE_DEVICE static void tile_to_float(const Tile& tile, float (&out)[BufSize]) + { + static_assert(Tile::get_thread_buffer_size() == BufSize); + const auto& buf = tile.get_thread_buffer(); + static_for<0, BufSize, 1>{}([&](auto i) { out[i.value] = type_convert(buf[i]); }); + } + + // Column-wise (dim=0) sum: accumulate SeqPerThread rows into KPerThread partial sums, + // then xor-shuffle across m_idx within warp. + template + CK_TILE_DEVICE static void column_reduce_thread_and_warp(const float* __restrict__ data, + float (&col_acc)[KPerThread]) + { + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] = 0.f; + + for(index_t m = 0; m < SeqPerThread; ++m) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += data[m * KPerThread + k]; + + for(index_t stride = KThreads; stride < WarpSize; stride *= 2) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += warp_shuffle(col_acc[k], __lane_id() ^ stride); + } + + // Cross-warp LDS reduction for column sums. 
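+    // Each warp's m_idx == 0 lanes stage their KPerThread partial sums at
+    // smem_reduce[warp_id * D + k_idx * KPerThread + k]; after the barrier every
+    // lane re-reads all NumWarps copies of its own k-slice and accumulates them,
+    // so on return col_acc holds the block-wide column sum for this lane's k_idx
+    // slice.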
+ CK_TILE_DEVICE static void column_reduce_cross_warp(float (&col_acc)[KPerThread], + float* __restrict__ smem_reduce) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + const index_t k_idx = lane_id % KThreads; + const index_t m_idx = lane_id / KThreads; + + if(m_idx == 0) + for(index_t k = 0; k < KPerThread; ++k) + smem_reduce[warp_id * D + k_idx * KPerThread + k] = col_acc[k]; + __syncthreads(); + + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] = 0.f; + for(index_t w = 0; w < NumWarps; ++w) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += smem_reduce[w * D + k_idx * KPerThread + k]; + __syncthreads(); + } + + // Compute ||v||^2 per row: sum along KPerThread then xor-shuffle across k_idx. + template + CK_TILE_DEVICE static void row_reduce_sq_norm(const float* __restrict__ data, + float (&row_norms)[SeqPerThread], + index_t actual_seq) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t m_idx = (tid % WarpSize) / KThreads; + + for(index_t m = 0; m < SeqPerThread; ++m) + { + float sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + { + float v = data[m * KPerThread + k]; + sq += v * v; + } + for(index_t stride = 1; stride < KThreads; stride *= 2) + sq += warp_shuffle(sq, __lane_id() ^ stride); + + index_t gsq = m * (SeqThreadPerWarp * NumWarps) + warp_id * SeqThreadPerWarp + m_idx; + row_norms[m] = (gsq < actual_seq) ? sq : 0.f; + } + } + + // Column reduce of normalised rows: sum_hat[d] = sum_i data[i,d] / ||data[i,:]||. + template + CK_TILE_DEVICE static void column_reduce_normalised(const float* __restrict__ data, + const float* __restrict__ row_norms, + float (&col_acc)[KPerThread], + index_t actual_seq) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t m_idx = (tid % WarpSize) / KThreads; + + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] = 0.f; + + for(index_t m = 0; m < SeqPerThread; ++m) + { + float inv_norm = (row_norms[m] > 0.f) ? (1.0f / __builtin_sqrtf(row_norms[m])) : 0.f; + index_t gsq = m * (SeqThreadPerWarp * NumWarps) + warp_id * SeqThreadPerWarp + m_idx; + if(gsq < actual_seq) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += data[m * KPerThread + k] * inv_norm; + } + + for(index_t stride = KThreads; stride < WarpSize; stride *= 2) + for(index_t k = 0; k < KPerThread; ++k) + col_acc[k] += warp_shuffle(col_acc[k], __lane_id() ^ stride); + } + + // Scalar reduce across k_idx lanes (within warp). + CK_TILE_DEVICE static float reduce_across_k(float v) + { + for(index_t stride = 1; stride < KThreads; stride *= 2) + v += warp_shuffle(v, __lane_id() ^ stride); + return v; + } + + // Full-block scalar reduce (warp xor + cross-warp LDS). 
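+    // Every lane first xor-reduces v across its warp, lane 0 of each warp parks
+    // the warp total in smem_small[warp_id], thread 0 folds the per-warp partials
+    // into smem_small[0], and all threads read that value back after the barrier.
+    // block_reduce_max below follows the same pattern with max in place of sum.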
+ CK_TILE_DEVICE static float block_reduce_sum(float v, float* smem_small) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + + for(index_t stride = 1; stride < WarpSize; stride *= 2) + v += warp_shuffle(v, __lane_id() ^ stride); + if(lane_id == 0) + smem_small[warp_id] = v; + __syncthreads(); + if(tid == 0) + { + float s = 0.f; + for(index_t w = 0; w < NumWarps; ++w) + s += smem_small[w]; + smem_small[0] = s; + } + __syncthreads(); + return smem_small[0]; + } + + CK_TILE_DEVICE static float block_reduce_max(float v, float* smem_small) + { + const index_t tid = static_cast(threadIdx.x); + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + + for(index_t stride = 1; stride < WarpSize; stride *= 2) + v = max(v, warp_shuffle(v, __lane_id() ^ stride)); + if(lane_id == 0) + smem_small[warp_id] = v; + __syncthreads(); + if(tid == 0) + { + float s = smem_small[0]; + for(index_t w = 1; w < NumWarps; ++w) + s = max(s, smem_small[w]); + smem_small[0] = s; + } + __syncthreads(); + return smem_small[0]; + } + + // ====================================================================== + template + CK_TILE_DEVICE void operator()(const QWindowType& q_window_in, + const KWindowType& k_window_in, + index_t seqlen_q, + index_t seqlen_k, + index_t qb, + index_t N_k, + index_t /*nhead_ratio_qk*/, + float simthreshd1, + float cdfthreshd, + float topk, + float scale, + uint8_t* block_map_ptr, + int32_t* lut_ptr, + int32_t* valid_block_num_ptr, + void* smem_ptr) const + { + const index_t tid = static_cast(threadIdx.x); + + auto* smem_float = reinterpret_cast(smem_ptr); + auto* smem_scores = + reinterpret_cast(reinterpret_cast(smem_ptr) + kScoreOffset); + auto* smem_bmap = + reinterpret_cast(reinterpret_cast(smem_ptr) + kBmapOffset); + auto* smem_small = + reinterpret_cast(reinterpret_cast(smem_ptr) + kSmallOffset); + + const index_t bs_q = min(static_cast(kM0), seqlen_q - qb * kM0); + const float inv_bs_q = (bs_q > 0) ? (1.0f / static_cast(bs_q)) : 0.f; + + // ================================================================== + // Phase 1: Q Block Statistics + // ================================================================== + auto q_tile = load_tile(q_window_in); + + float q_data[MPerThread * KPerThread]; + tile_to_float(q_tile, q_data); + + // 1a. L2 norm per token + float psq[MPerThread]; + row_reduce_sq_norm(q_data, psq, bs_q); + + // 1b. Column sum -> mean + float pooled_q_mean[KPerThread]; + column_reduce_thread_and_warp(q_data, pooled_q_mean); + column_reduce_cross_warp(pooled_q_mean, smem_float); + for(index_t k = 0; k < KPerThread; ++k) + pooled_q_mean[k] *= inv_bs_q; + + // 1c. Normalised sum_hat + float sum_hat[KPerThread]; + column_reduce_normalised(q_data, psq, sum_hat, bs_q); + column_reduce_cross_warp(sum_hat, smem_float); + + // 1d. 
sim_q = ||sum_hat||^2 / bs_q^2 + float sh_sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + sh_sq += sum_hat[k] * sum_hat[k]; + sh_sq = reduce_across_k(sh_sq); + const float denom_q = static_cast(bs_q) * static_cast(bs_q); + const bool sim_q = (denom_q > 0.f) && ((sh_sq / denom_q) > simthreshd1); + + // Not similar → force all K blocks ON, early exit + if(!sim_q) + { + for(index_t i = tid; i < N_k; i += kBlockSize) + block_map_ptr[i] = 1; + + if(lut_ptr != nullptr && tid == 0) + { + int32_t valid = 0, prev = 0; + for(index_t kb = 0; kb < N_k; ++kb) + { + lut_ptr[valid] = static_cast(kb) - prev; + prev = static_cast(kb); + ++valid; + } + for(index_t i = valid; i < N_k; ++i) + lut_ptr[i] = 0; + *valid_block_num_ptr = valid; + } + return; + } + + // ================================================================== + // Phase 2: K Block Loop + // ================================================================== + for(index_t i = tid; i < N_k; i += kBlockSize) + smem_bmap[i] = 0; + __syncthreads(); + + auto k_window = k_window_in; + + for(index_t kb = 0; kb < N_k; ++kb) + { + const index_t bs_k = min(static_cast(kN0), seqlen_k - kb * kN0); + const float inv_bs_k = (bs_k > 0) ? (1.0f / static_cast(bs_k)) : 0.f; + + auto k_tile = load_tile(k_window); + + float k_data[NPerThread * KPerThread]; + tile_to_float(k_tile, k_data); + + // K mean + float pooled_k_mean[KPerThread]; + column_reduce_thread_and_warp(k_data, pooled_k_mean); + column_reduce_cross_warp(pooled_k_mean, smem_float); + for(index_t k = 0; k < KPerThread; ++k) + pooled_k_mean[k] *= inv_bs_k; + + // dot(pooled_q_mean, pooled_k_mean) + float dot = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + dot += pooled_q_mean[k] * pooled_k_mean[k]; + dot = reduce_across_k(dot); + + // K L2 norms + normalised sum_hat + float k_psq[NPerThread]; + row_reduce_sq_norm(k_data, k_psq, bs_k); + + float k_sum_hat[KPerThread]; + column_reduce_normalised(k_data, k_psq, k_sum_hat, bs_k); + column_reduce_cross_warp(k_sum_hat, smem_float); + + // sim_k + float ksh_sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + ksh_sq += k_sum_hat[k] * k_sum_hat[k]; + ksh_sq = reduce_across_k(ksh_sq); + const float denom_k = static_cast(bs_k) * static_cast(bs_k); + const bool sim_k = (denom_k > 0.f) && ((ksh_sq / denom_k) > simthreshd1); + + if(tid == 0) + { + if(!sim_k) + { + smem_bmap[kb] = 1; + smem_scores[kb] = -numeric::infinity(); + } + else + { + smem_scores[kb] = dot * scale; + } + } + __syncthreads(); + + move_tile_window(k_window, {kN0, 0}); + } + + // ================================================================== + // Phase 3: Softmax + Selection + // ================================================================== + + // max + float lmax = -numeric::infinity(); + for(index_t i = tid; i < N_k; i += kBlockSize) + lmax = max(lmax, smem_scores[i]); + const float max_score = block_reduce_max(lmax, smem_small); + + // exp + sum + float lsum = 0.f; + for(index_t i = tid; i < N_k; i += kBlockSize) + { + float e = (smem_scores[i] > -numeric::infinity()) + ? __builtin_expf(smem_scores[i] - max_score) + : 0.f; + smem_scores[i] = e; + lsum += e; + } + const float sum_exp = block_reduce_sum(lsum, smem_small); + + // normalise + const float inv_sum = (sum_exp > 0.f) ? (1.0f / sum_exp) : 0.f; + for(index_t i = tid; i < N_k; i += kBlockSize) + smem_scores[i] *= inv_sum; + __syncthreads(); + + // Selection: iterative argmax + index_t num_to_select = + (topk > 0.f) + ? 
max(static_cast(1), static_cast(topk * static_cast(N_k))) + : N_k; + + float cumulative_prob = 0.f; + for(index_t round = 0; round < num_to_select; ++round) + { + // thread-local argmax + float best_val = -1.f; + index_t best_idx = 0; + for(index_t i = tid; i < N_k; i += kBlockSize) + { + if(smem_scores[i] > best_val || (smem_scores[i] == best_val && i < best_idx)) + { + best_val = smem_scores[i]; + best_idx = i; + } + } + + // warp argmax + for(index_t stride = 1; stride < WarpSize; stride *= 2) + { + float rv = warp_shuffle(best_val, __lane_id() ^ stride); + index_t ri = warp_shuffle(best_idx, __lane_id() ^ stride); + if(rv > best_val || (rv == best_val && ri < best_idx)) + { + best_val = rv; + best_idx = ri; + } + } + + // cross-warp argmax via LDS + const index_t lane_id = tid % WarpSize; + const index_t warp_id = tid / WarpSize; + if(lane_id == 0) + { + smem_small[warp_id] = best_val; + smem_small[NumWarps + warp_id] = bit_cast(static_cast(best_idx)); + } + __syncthreads(); + + if(tid == 0) + { + float bv = smem_small[0]; + index_t bi = bit_cast(smem_small[NumWarps]); + for(index_t w = 1; w < NumWarps; ++w) + { + float wv = smem_small[w]; + index_t wi = bit_cast(smem_small[NumWarps + w]); + if(wv > bv || (wv == bv && wi < bi)) + { + bv = wv; + bi = wi; + } + } + smem_small[0] = bv; + smem_small[1] = bit_cast(static_cast(bi)); + } + __syncthreads(); + + float g_val = smem_small[0]; + index_t g_idx = bit_cast(smem_small[1]); + + if(g_val <= 0.f) + break; + + if(tid == 0) + { + smem_bmap[g_idx] = 1; + smem_scores[g_idx] = -1.f; + } + __syncthreads(); + + if(topk > 0.f) + { + if(round + 1 >= num_to_select) + break; + } + else + { + cumulative_prob += g_val; + if(cumulative_prob >= cdfthreshd) + break; + } + } + + // ================================================================== + // Write outputs to global memory + // ================================================================== + for(index_t i = tid; i < N_k; i += kBlockSize) + block_map_ptr[i] = smem_bmap[i]; + + if(lut_ptr != nullptr && tid == 0) + { + int32_t valid = 0, prev = 0; + for(index_t kb = 0; kb < N_k; ++kb) + { + if(smem_bmap[kb] != 0) + { + lut_ptr[valid] = static_cast(kb) - prev; + prev = static_cast(kb); + ++valid; + } + } + for(index_t i = valid; i < N_k; ++i) + lut_ptr[i] = 0; + *valid_block_num_ptr = valid; + } + } +}; + +} // namespace ck_tile From c7e6e4f616b483f2a9aafd3e8d00238f02de77e5 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Tue, 14 Apr 2026 10:11:00 -0400 Subject: [PATCH 4/7] fix extra host side operations. 
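
The sparge_blockmap_gpu() and vsa_sparge_attention() host wrappers each owned
their own DeviceMem, so Q/K were uploaded twice and the block map / LUT took a
round trip through host memory between the two kernels. The example now manages
the device buffers itself: Q/K/V are copied to the device once, the block-map
kernel writes block_map / lut / valid_block_num into device buffers, and the VSA
attention kernel consumes those buffers directly. Only the attention output and
the block map (for sparsity stats) are copied back unconditionally; the LUT and
valid_block_num come back only when -v=1 validation is requested.

Roughly, the call flow in the example becomes (sketch; variable names and the
full argument setup are in the diff below):

    // HtoD once
    q_buf.ToDevice(q_host.data());
    k_buf.ToDevice(k_host.data());
    v_buf.ToDevice(v_host.data());

    // block-map kernel fills device-side bmap/lut/valid buffers
    sparge_blockmap_fwd(traits, args, ck_tile::stream_config{});

    // attention kernel reads the LUT straight from device memory
    fmha_args.lut_ptr             = lut_buf.GetDeviceBuffer();
    fmha_args.valid_block_num_ptr = valid_buf.GetDeviceBuffer();
    float avg_time_ms = sparge_vsa_fwd(fmha_traits, fmha_args, stream_config);

    // DtoH only what the host actually inspects
    o_buf.FromDevice(output_host.data(), output_host.get_element_space_size_in_bytes());
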
--- example/ck_tile/50_sparse_attn/CMakeLists.txt | 4 - .../50_sparse_attn/sparge_blockmap.cpp | 156 --------- .../ck_tile/50_sparse_attn/sparge_blockmap.h | 26 -- .../test_sparge_vsa_sparse_attn.cpp | 296 ++++++++++-------- .../50_sparse_attn/vsa_sparge_attention.cpp | 195 ------------ .../50_sparse_attn/vsa_sparge_attention.h | 28 -- 6 files changed, 164 insertions(+), 541 deletions(-) delete mode 100644 example/ck_tile/50_sparse_attn/sparge_blockmap.cpp delete mode 100644 example/ck_tile/50_sparse_attn/sparge_blockmap.h delete mode 100644 example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp delete mode 100644 example/ck_tile/50_sparse_attn/vsa_sparge_attention.h diff --git a/example/ck_tile/50_sparse_attn/CMakeLists.txt b/example/ck_tile/50_sparse_attn/CMakeLists.txt index 169ed87ac3b..f234f631b6b 100644 --- a/example/ck_tile/50_sparse_attn/CMakeLists.txt +++ b/example/ck_tile/50_sparse_attn/CMakeLists.txt @@ -249,14 +249,12 @@ set(SPARGE_VSA_INSTANCES "tile_sparge_vsa_instances") add_library(${SPARGE_VSA_INSTANCES} OBJECT EXCLUDE_FROM_ALL ${SPARGE_VSA_GEN_BLOBS} - ${CMAKE_CURRENT_LIST_DIR}/vsa_sparge_attention.cpp ) target_include_directories(${SPARGE_VSA_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR} ${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn ) set_source_files_properties(${SPARGE_VSA_GEN_BLOBS} PROPERTIES LANGUAGE HIP) -set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/vsa_sparge_attention.cpp PROPERTIES LANGUAGE HIP) set_property(TARGET ${SPARGE_VSA_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) target_compile_options(${SPARGE_VSA_INSTANCES} PRIVATE @@ -273,7 +271,6 @@ set(SPARGE_BLOCKMAP_INSTANCES "tile_sparge_blockmap_instances") add_library(${SPARGE_BLOCKMAP_INSTANCES} OBJECT EXCLUDE_FROM_ALL ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap_inst.cpp - ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap.cpp ) target_include_directories(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR} @@ -281,7 +278,6 @@ target_include_directories(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE ) set_source_files_properties( ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap_inst.cpp - ${CMAKE_CURRENT_LIST_DIR}/sparge_blockmap.cpp PROPERTIES LANGUAGE HIP ) set_property(TARGET ${SPARGE_BLOCKMAP_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap.cpp b/example/ck_tile/50_sparse_attn/sparge_blockmap.cpp deleted file mode 100644 index b9ac56c533c..00000000000 --- a/example/ck_tile/50_sparse_attn/sparge_blockmap.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#include "sparge_blockmap.h" -#include "sparge_blockmap_trek.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/host/host_tensor.hpp" -#include "ck_tile/host/device_memory.hpp" -#include -#include - -template -sparge::VSALut sparge_blockmap_gpu(const ck_tile::HostTensor& TQ, - const ck_tile::HostTensor& TK, - ck_tile::HostTensor& block_map_out, - int batch, - int nhead_q, - int nhead_k, - int seqlen_q, - int seqlen_k, - int hdim_q, - bool i_perm, - float simthreshd1, - float cdfthreshd, - float topk, - int blkq, - int blkk, - int log_level) -{ - static_assert(std::is_same_v || - std::is_same_v, - "sparge_blockmap_gpu supports fp16/bf16 only."); - - std::string data_type = "fp16"; - if constexpr(std::is_same_v) - { - data_type = "bf16"; - } - - const ck_tile::index_t num_q_blocks = ck_tile::integer_divide_ceil(seqlen_q, blkq); - const ck_tile::index_t num_k_blocks = ck_tile::integer_divide_ceil(seqlen_k, blkk); - - const float scale = 1.0f / std::sqrt(static_cast(hdim_q)); - - // Allocate device memory - ck_tile::DeviceMem q_buf(TQ.get_element_space_size_in_bytes()); - ck_tile::DeviceMem k_buf(TK.get_element_space_size_in_bytes()); - - const std::size_t bmap_bytes = - static_cast(batch) * nhead_q * num_q_blocks * num_k_blocks * sizeof(uint8_t); - const std::size_t lut_bytes = - static_cast(batch) * nhead_q * num_q_blocks * num_k_blocks * sizeof(int32_t); - const std::size_t valid_bytes = - static_cast(batch) * nhead_q * num_q_blocks * sizeof(int32_t); - - ck_tile::DeviceMem bmap_buf(bmap_bytes); - ck_tile::DeviceMem lut_buf(lut_bytes); - ck_tile::DeviceMem valid_buf(valid_bytes); - - q_buf.ToDevice(TQ.data()); - k_buf.ToDevice(TK.data()); - bmap_buf.SetZero(); - lut_buf.SetZero(); - valid_buf.SetZero(); - - // Compute strides (assumes BHSD if i_perm, BSHD otherwise) - const ck_tile::index_t stride_q = i_perm ? hdim_q : nhead_q * hdim_q; - const ck_tile::index_t stride_k = i_perm ? hdim_q : nhead_k * hdim_q; - const ck_tile::index_t nhead_stride_q = - i_perm ? static_cast(seqlen_q) * hdim_q : hdim_q; - const ck_tile::index_t nhead_stride_k = - i_perm ? 
static_cast(seqlen_k) * hdim_q : hdim_q; - const ck_tile::index_t batch_stride_q = - static_cast(nhead_q) * seqlen_q * hdim_q; - const ck_tile::index_t batch_stride_k = - static_cast(nhead_k) * seqlen_k * hdim_q; - - ck_tile::stream_config stream_config{nullptr, false, log_level, 0, 1, false}; - - sparge_blockmap_args args; - args.q_ptr = q_buf.GetDeviceBuffer(); - args.k_ptr = k_buf.GetDeviceBuffer(); - args.batch = batch; - args.seqlen_q = seqlen_q; - args.seqlen_k = seqlen_k; - args.hdim_q = hdim_q; - args.nhead_q = nhead_q; - args.nhead_k = nhead_k; - args.stride_q = stride_q; - args.stride_k = stride_k; - args.nhead_stride_q = nhead_stride_q; - args.nhead_stride_k = nhead_stride_k; - args.batch_stride_q = batch_stride_q; - args.batch_stride_k = batch_stride_k; - args.simthreshd1 = simthreshd1; - args.cdfthreshd = cdfthreshd; - args.topk = topk; - args.scale = scale; - args.block_map_ptr = bmap_buf.GetDeviceBuffer(); - args.lut_ptr = lut_buf.GetDeviceBuffer(); - args.valid_block_num_ptr = valid_buf.GetDeviceBuffer(); - - sparge_blockmap_traits traits; - traits.data_type = data_type; - traits.hdim_q = hdim_q; - - sparge_blockmap_fwd(traits, args, stream_config); - - // Copy results back to host - bmap_buf.FromDevice(block_map_out.data(), bmap_bytes); - - sparge::VSALut vsa_lut{ - ck_tile::HostTensor({batch, nhead_q, num_q_blocks, num_k_blocks}), - ck_tile::HostTensor({batch, nhead_q, num_q_blocks}), - }; - lut_buf.FromDevice(vsa_lut.lut.data(), lut_bytes); - valid_buf.FromDevice(vsa_lut.valid_block_num.data(), valid_bytes); - - return vsa_lut; -} - -// Explicit template instantiations -template sparge::VSALut -sparge_blockmap_gpu(const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - ck_tile::HostTensor&, - int, - int, - int, - int, - int, - int, - bool, - float, - float, - float, - int, - int, - int); - -template sparge::VSALut -sparge_blockmap_gpu(const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - ck_tile::HostTensor&, - int, - int, - int, - int, - int, - int, - bool, - float, - float, - float, - int, - int, - int); diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap.h b/example/ck_tile/50_sparse_attn/sparge_blockmap.h deleted file mode 100644 index 3057257ca14..00000000000 --- a/example/ck_tile/50_sparse_attn/sparge_blockmap.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host/host_tensor.hpp" -#include "sparge_tool.hpp" - -template -sparge::VSALut sparge_blockmap_gpu(const ck_tile::HostTensor& TQ, - const ck_tile::HostTensor& TK, - ck_tile::HostTensor& block_map_out, - int batch, - int nhead_q, - int nhead_k, - int seqlen_q, - int seqlen_k, - int hdim_q, - bool i_perm, - float simthreshd1, - float cdfthreshd, - float topk, - int blkq, - int blkk, - int log_level = 0); diff --git a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp index 638a867b0f3..572b708f9ef 100644 --- a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp +++ b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp @@ -1,23 +1,17 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -// Demo: Sparge block-map -> (delta LUT) -> VSA sparse attention +// Demo: Sparge block-map -> (delta LUT) -> VSA sparse attention (all-in-device) #include -#include #include -#include #include -#include -#include -#include - #include "ck_tile/host.hpp" #include "ck_tile/core.hpp" #include "ck_tile/host/reference/reference_blocked_attention.hpp" #include "ck_tile/core/utility/bit_cast.hpp" -#include "vsa_sparge_attention.h" -#include "sparge_blockmap.h" +#include "sparge_blockmap_trek.hpp" +#include "fmha_fwd_trek.hpp" #include "sparge_tool.hpp" // ============================================================================ @@ -192,7 +186,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) << ", topk=" << topk << ")" << std::endl; std::cout << " i_perm: " << i_perm << ", o_perm: " << o_perm << std::endl; - // Create host tensors + // Create host tensors and fill with random data ck_tile::HostTensor q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); ck_tile::HostTensor k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); ck_tile::HostTensor v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); @@ -206,119 +200,157 @@ bool run_test(const ck_tile::ArgParser& arg_parser) ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); // ================================================================== - // GPU: Build block map + VSA LUT in one kernel (always run) + // Allocate device memory once, HtoD once // ================================================================== - std::cout << "Building Sparge block map + VSA LUT (GPU)..." << std::endl; - ck_tile::HostTensor block_map_gpu({batch, nhead, num_q_blocks, num_k_blocks}); - auto vsa_lut_gpu = sparge_blockmap_gpu(q_host, - k_host, - block_map_gpu, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - i_perm, - simthreshd1, - cdfthreshd, - topk, - static_cast(BLKQ), - static_cast(BLKK), - 0); + ck_tile::DeviceMem q_buf(q_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem k_buf(k_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem v_buf(v_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem o_buf(output_host.get_element_space_size_in_bytes()); + + q_buf.ToDevice(q_host.data()); + k_buf.ToDevice(k_host.data()); + v_buf.ToDevice(v_host.data()); + + const std::size_t bmap_bytes = + static_cast(batch) * nhead * num_q_blocks * num_k_blocks * sizeof(uint8_t); + const std::size_t lut_bytes = + static_cast(batch) * nhead * num_q_blocks * num_k_blocks * sizeof(int32_t); + const std::size_t valid_bytes = + static_cast(batch) * nhead * num_q_blocks * sizeof(int32_t); + + ck_tile::DeviceMem bmap_buf(bmap_bytes); + ck_tile::DeviceMem lut_buf(lut_bytes); + ck_tile::DeviceMem valid_buf(valid_bytes); + bmap_buf.SetZero(); + lut_buf.SetZero(); + valid_buf.SetZero(); // ================================================================== - // VSA sparse attention kernel (always run) + // Common stride calculations // ================================================================== - std::cout << "\n--- Running VSA sparse attention kernel ---" << std::endl; + assert(nhead % nhead_k == 0); + const float scale_s = 1.0f / std::sqrt(static_cast(hdim_q)); + + const ck_tile::index_t stride_q = i_perm ? hdim_q : nhead * hdim_q; + const ck_tile::index_t stride_k = i_perm ? hdim_q : nhead_k * hdim_q; + const ck_tile::index_t stride_v = i_perm ? hdim_v : nhead_k * hdim_v; + const ck_tile::index_t stride_o = o_perm ? 
hdim_v : nhead * hdim_v; + const ck_tile::index_t nhead_stride_q = i_perm ? seqlen_q * hdim_q : hdim_q; + const ck_tile::index_t nhead_stride_k = i_perm ? seqlen_k * hdim_q : hdim_q; + const ck_tile::index_t nhead_stride_v = i_perm ? seqlen_k * hdim_v : hdim_v; + const ck_tile::index_t nhead_stride_o = o_perm ? seqlen_q * hdim_v : hdim_v; + const ck_tile::index_t batch_stride_q = nhead * seqlen_q * hdim_q; + const ck_tile::index_t batch_stride_k = nhead_k * seqlen_k * hdim_q; + const ck_tile::index_t batch_stride_v = nhead_k * hdim_v * seqlen_k; + const ck_tile::index_t batch_stride_o = nhead * seqlen_q * hdim_v; + + std::string data_type = "fp16"; + if constexpr(std::is_same_v) + data_type = "bf16"; - try - { - if(kname) - { - vsa_sparge_attention(q_host, - k_host, - v_host, - vsa_lut_gpu.lut, - vsa_lut_gpu.valid_block_num, - output_host, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - i_perm, - o_perm, - seqlen_q, - seqlen_k, - 1); - } + std::string msk_str = "0"; + mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); - for(int i = 0; i < warmup; ++i) - { - vsa_sparge_attention(q_host, - k_host, - v_host, - vsa_lut_gpu.lut, - vsa_lut_gpu.valid_block_num, - output_host, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - i_perm, - o_perm, - seqlen_q, - seqlen_k, - 0); - } + // ================================================================== + // GPU: Build block map + VSA LUT (always run, device-only) + // ================================================================== + std::cout << "Building Sparge block map + VSA LUT (GPU)..." << std::endl; + { + sparge_blockmap_args args; + args.q_ptr = q_buf.GetDeviceBuffer(); + args.k_ptr = k_buf.GetDeviceBuffer(); + args.batch = batch; + args.seqlen_q = seqlen_q; + args.seqlen_k = seqlen_k; + args.hdim_q = hdim_q; + args.nhead_q = nhead; + args.nhead_k = nhead_k; + args.stride_q = stride_q; + args.stride_k = stride_k; + args.nhead_stride_q = nhead_stride_q; + args.nhead_stride_k = nhead_stride_k; + args.batch_stride_q = batch_stride_q; + args.batch_stride_k = batch_stride_k; + args.simthreshd1 = simthreshd1; + args.cdfthreshd = cdfthreshd; + args.topk = topk; + args.scale = scale_s; + args.block_map_ptr = bmap_buf.GetDeviceBuffer(); + args.lut_ptr = lut_buf.GetDeviceBuffer(); + args.valid_block_num_ptr = valid_buf.GetDeviceBuffer(); + + sparge_blockmap_traits traits; + traits.data_type = data_type; + traits.hdim_q = hdim_q; + + sparge_blockmap_fwd(traits, args, ck_tile::stream_config{}); + } - [[maybe_unused]] auto sync_status1 = hipDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); + // ================================================================== + // VSA sparse attention kernel (always run, LUT stays on device) + // ================================================================== + std::cout << "\n--- Running VSA sparse attention kernel ---" << std::endl; - for(int i = 0; i < repeat; ++i) - { - vsa_sparge_attention(q_host, - k_host, - v_host, - vsa_lut_gpu.lut, - vsa_lut_gpu.valid_block_num, - output_host, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - i_perm, - o_perm, - seqlen_q, - seqlen_k, - 0); - } + fmha_vsa_fwd_args fmha_args; + fmha_args.q_ptr = q_buf.GetDeviceBuffer(); + fmha_args.k_ptr = k_buf.GetDeviceBuffer(); + fmha_args.v_ptr = v_buf.GetDeviceBuffer(); + fmha_args.lut_ptr = lut_buf.GetDeviceBuffer(); + fmha_args.valid_block_num_ptr = valid_buf.GetDeviceBuffer(); + fmha_args.o_ptr = 
o_buf.GetDeviceBuffer(); + fmha_args.batch = batch; + fmha_args.seqlen_q = seqlen_q; + fmha_args.seqlen_k = seqlen_k; + fmha_args.max_seqlen_q = seqlen_q; + fmha_args.hdim_q = hdim_q; + fmha_args.hdim_v = hdim_v; + fmha_args.nhead_q = nhead; + fmha_args.nhead_k = nhead_k; + fmha_args.scale_s = scale_s; + fmha_args.stride_q = stride_q; + fmha_args.stride_k = stride_k; + fmha_args.stride_v = stride_v; + fmha_args.stride_o = stride_o; + fmha_args.nhead_stride_q = nhead_stride_q; + fmha_args.nhead_stride_k = nhead_stride_k; + fmha_args.nhead_stride_v = nhead_stride_v; + fmha_args.nhead_stride_o = nhead_stride_o; + fmha_args.batch_stride_q = batch_stride_q; + fmha_args.batch_stride_k = batch_stride_k; + fmha_args.batch_stride_v = batch_stride_v; + fmha_args.batch_stride_o = batch_stride_o; + fmha_args.window_size_left = mask.left; + fmha_args.window_size_right = mask.right; + fmha_args.mask_type = static_cast(mask.type); + + fmha_vsa_fwd_traits fmha_traits; + fmha_traits.hdim_q = hdim_q; + fmha_traits.hdim_v = hdim_v; + fmha_traits.data_type = data_type; + fmha_traits.is_v_rowmajor = true; + fmha_traits.mask_type = mask.type; + + ck_tile::stream_config stream_config{nullptr, + true, + /* log_level = */ kname ? 1 : 0, + warmup, + repeat, + false}; + + float avg_time_ms = sparge_vsa_fwd(fmha_traits, fmha_args, stream_config); + + std::cout << "\n>>>> VSA sparse attention average time: " << avg_time_ms << " ms <<<<" + << std::endl; - [[maybe_unused]] auto sync_status2 = hipDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - double avg_time_ms = - std::chrono::duration(end - start).count() / repeat; + // DtoH: attention output (always needed) + o_buf.FromDevice(output_host.data(), output_host.get_element_space_size_in_bytes()); - std::cout << "\n>>>> VSA sparse attention average time: " << avg_time_ms << " ms <<<<" - << std::endl; - } - catch(const std::exception& e) - { - std::cerr << "Error during kernel execution: " << e.what() << std::endl; - return false; - } + // DtoH: block_map (needed for sparsity stats and validation) + ck_tile::HostTensor block_map_gpu({batch, nhead, num_q_blocks, num_k_blocks}); + bmap_buf.FromDevice(block_map_gpu.data(), bmap_bytes); // ================================================================== - // Sparsity statistics (always run, pure CPU read of HostTensor) + // Sparsity statistics (pure CPU, reads block_map HostTensor) // ================================================================== std::size_t total_blocks = 0; std::size_t active_blocks = 0; @@ -366,6 +398,14 @@ bool run_test(const ck_tile::ArgParser& arg_parser) std::cout << "Converting block map to VSA LUT (delta, CPU)..." 
<< std::endl; auto vsa_lut_cpu = sparge::block_map_to_vsa_lut_delta(block_relation_onehot); + // DtoH: LUT + valid_block_num (only for validation) + sparge::VSALut vsa_lut_gpu{ + ck_tile::HostTensor({batch, nhead, num_q_blocks, num_k_blocks}), + ck_tile::HostTensor({batch, nhead, num_q_blocks}), + }; + lut_buf.FromDevice(vsa_lut_gpu.lut.data(), lut_bytes); + valid_buf.FromDevice(vsa_lut_gpu.valid_block_num.data(), valid_bytes); + // Validate block map std::cout << "\n--- Validating GPU block map vs CPU golden ---" << std::endl; { @@ -378,20 +418,16 @@ bool run_test(const ck_tile::ArgParser& arg_parser) { for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) { - if(block_map_gpu(b, h, qb, kb) != - block_relation_onehot(b, h, qb, kb)) + if(block_map_gpu(b, h, qb, kb) != block_relation_onehot(b, h, qb, kb)) { bmap_mismatches++; if(bmap_mismatches <= 10) { std::cout - << " block_map mismatch at [" << b << "," << h << "," - << qb << "," << kb - << "]: GPU=" - << static_cast(block_map_gpu(b, h, qb, kb)) - << " CPU=" - << static_cast( - block_relation_onehot(b, h, qb, kb)) + << " block_map mismatch at [" << b << "," << h << "," << qb + << "," << kb << "]: GPU=" + << static_cast(block_map_gpu(b, h, qb, kb)) << " CPU=" + << static_cast(block_relation_onehot(b, h, qb, kb)) << std::endl; } } @@ -429,28 +465,24 @@ bool run_test(const ck_tile::ArgParser& arg_parser) valid_mismatches++; if(valid_mismatches <= 5) { - std::cout - << " valid_block_num mismatch at [" << b << "," << h - << "," << qb - << "]: GPU=" << vsa_lut_gpu.valid_block_num(b, h, qb) - << " CPU=" << vsa_lut_cpu.valid_block_num(b, h, qb) - << std::endl; + std::cout << " valid_block_num mismatch at [" << b << "," << h + << "," << qb + << "]: GPU=" << vsa_lut_gpu.valid_block_num(b, h, qb) + << " CPU=" << vsa_lut_cpu.valid_block_num(b, h, qb) + << std::endl; } } for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) { - if(vsa_lut_gpu.lut(b, h, qb, kb) != - vsa_lut_cpu.lut(b, h, qb, kb)) + if(vsa_lut_gpu.lut(b, h, qb, kb) != vsa_lut_cpu.lut(b, h, qb, kb)) { lut_mismatches++; if(lut_mismatches <= 10) { std::cout << " LUT mismatch at [" << b << "," << h << "," << qb - << "," << kb - << "]: GPU=" << vsa_lut_gpu.lut(b, h, qb, kb) - << " CPU=" << vsa_lut_cpu.lut(b, h, qb, kb) - << std::endl; + << "," << kb << "]: GPU=" << vsa_lut_gpu.lut(b, h, qb, kb) + << " CPU=" << vsa_lut_cpu.lut(b, h, qb, kb) << std::endl; } } } diff --git a/example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp b/example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp deleted file mode 100644 index 5f9c2676ddb..00000000000 --- a/example/ck_tile/50_sparse_attn/vsa_sparge_attention.cpp +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#include "vsa_sparge_attention.h" -#include "fmha_fwd_trek.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/host/host_tensor.hpp" -#include "ck_tile/host/device_memory.hpp" -#include - -template -ck_tile::HostTensor -vsa_sparge_attention(const ck_tile::HostTensor& TQ, - const ck_tile::HostTensor& TK, - const ck_tile::HostTensor& TV, - const ck_tile::HostTensor& TKV_block_idx, - const ck_tile::HostTensor& TKV_blocks, - ck_tile::HostTensor& Y, - int batch, - int nhead, - int nhead_k, - int seqlen_q, - int seqlen_k, - int hdim_q, - int hdim_v, - bool i_perm, - bool o_perm, - int max_seqlen_q, - int max_seqlen_k, - int log_level) -{ - static_assert(std::is_same_v || - std::is_same_v, - "VSA sparse attention supports fp16/bf16 only."); - std::string data_type = "fp16"; - if constexpr(std::is_same_v) - { - data_type = "bf16"; - } - - if(max_seqlen_q == 0) - max_seqlen_q = seqlen_q; - if(max_seqlen_k == 0) - max_seqlen_k = seqlen_k; - bool is_v_rowmajor = true; - float scale_s = 1.0 / ck_tile::sqrt(static_cast(hdim_q)); - std::string msk_str = "0"; - mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); - - const ck_tile::index_t shape_seqlen_q = seqlen_q; - const ck_tile::index_t shape_seqlen_k = seqlen_k; - - ck_tile::stream_config stream_config{nullptr, - false, // time_kernel - log_level, - 0, - 1, - false}; - - ck_tile::DeviceMem q_buf(TQ.get_element_space_size_in_bytes()); - ck_tile::DeviceMem k_buf(TK.get_element_space_size_in_bytes()); - ck_tile::DeviceMem v_buf(TV.get_element_space_size_in_bytes()); - ck_tile::DeviceMem lut_buf(TKV_block_idx.get_element_space_size_in_bytes()); - ck_tile::DeviceMem valid_block_num_buf(TKV_blocks.get_element_space_size_in_bytes()); - ck_tile::DeviceMem o_buf(Y.get_element_space_size_in_bytes()); - - q_buf.ToDevice(TQ.data()); - k_buf.ToDevice(TK.data()); - v_buf.ToDevice(TV.data()); - lut_buf.ToDevice(TKV_block_idx.data()); - valid_block_num_buf.ToDevice(TKV_blocks.data()); - - const auto init_args = [&](auto& args) { - assert(nhead % nhead_k == 0); - const ck_tile::index_t stride_q = (i_perm ? hdim_q : nhead * hdim_q); - const ck_tile::index_t stride_k = (i_perm ? hdim_q : nhead_k * hdim_q); - const ck_tile::index_t stride_v = [&]() { - if(is_v_rowmajor) - return i_perm ? hdim_v : nhead_k * hdim_v; - else - return (i_perm ? shape_seqlen_k : nhead_k * shape_seqlen_k); - }(); - const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v); - const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q); - const ck_tile::index_t nhead_stride_k = i_perm ? shape_seqlen_k * hdim_q : hdim_q; - const ck_tile::index_t nhead_stride_v = [&]() { - if(is_v_rowmajor) - return i_perm ? shape_seqlen_k * hdim_v : hdim_v; - else - return i_perm ? hdim_v * shape_seqlen_k : shape_seqlen_k; - }(); - const ck_tile::index_t nhead_stride_o = (o_perm ? 
shape_seqlen_q * hdim_v : hdim_v); - const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q); - const ck_tile::index_t batch_stride_k = nhead_k * shape_seqlen_k * hdim_q; - const ck_tile::index_t batch_stride_v = nhead_k * hdim_v * shape_seqlen_k; - const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v); - - args.q_ptr = q_buf.GetDeviceBuffer(); - args.k_ptr = k_buf.GetDeviceBuffer(); - args.v_ptr = v_buf.GetDeviceBuffer(); - args.lut_ptr = lut_buf.GetDeviceBuffer(); - args.valid_block_num_ptr = valid_block_num_buf.GetDeviceBuffer(); - - args.batch = batch; - args.seqlen_q = shape_seqlen_q; - args.hdim_q = hdim_q; - args.hdim_v = hdim_v; - args.nhead_q = nhead; - args.nhead_k = nhead_k; - - args.stride_q = stride_q; - args.stride_k = stride_k; - args.stride_v = stride_v; - args.nhead_stride_q = nhead_stride_q; - args.nhead_stride_k = nhead_stride_k; - args.nhead_stride_v = nhead_stride_v; - args.batch_stride_q = batch_stride_q; - args.batch_stride_k = batch_stride_k; - args.batch_stride_v = batch_stride_v; - - args.o_ptr = o_buf.GetDeviceBuffer(); - - args.seqlen_k = shape_seqlen_k; - args.max_seqlen_q = max_seqlen_q; - - args.scale_s = scale_s; - - args.stride_o = stride_o; - args.nhead_stride_o = nhead_stride_o; - args.batch_stride_o = batch_stride_o; - - args.window_size_left = mask.left; - args.window_size_right = mask.right; - args.mask_type = static_cast(mask.type); - }; - - const auto init_traits = [&](auto& traits) { - traits.hdim_q = hdim_q; - traits.hdim_v = hdim_v; - traits.data_type = data_type; - traits.is_v_rowmajor = is_v_rowmajor; - traits.mask_type = mask.type; - }; - - fmha_vsa_fwd_traits fmha_traits; - init_traits(fmha_traits); - - fmha_vsa_fwd_args args; - init_args(args); - - sparge_vsa_fwd(fmha_traits, args, stream_config); - - o_buf.FromDevice(Y.data(), Y.get_element_space_size_in_bytes()); - - return Y; -} - -template ck_tile::HostTensor -vsa_sparge_attention(const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - ck_tile::HostTensor&, - int, - int, - int, - int, - int, - int, - int, - bool, - bool, - int, - int, - int); - -template ck_tile::HostTensor -vsa_sparge_attention(const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - ck_tile::HostTensor&, - int, - int, - int, - int, - int, - int, - int, - bool, - bool, - int, - int, - int); diff --git a/example/ck_tile/50_sparse_attn/vsa_sparge_attention.h b/example/ck_tile/50_sparse_attn/vsa_sparge_attention.h deleted file mode 100644 index d51a7e8c00b..00000000000 --- a/example/ck_tile/50_sparse_attn/vsa_sparge_attention.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#pragma once -#include -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host/host_tensor.hpp" - -template -ck_tile::HostTensor -vsa_sparge_attention(const ck_tile::HostTensor& TQ, - const ck_tile::HostTensor& TK, - const ck_tile::HostTensor& TV, - const ck_tile::HostTensor& TKV_block_idx, - const ck_tile::HostTensor& TKV_blocks, - ck_tile::HostTensor& Y, - int batch, - int nhead, - int nhead_k, - int seqlen_q, - int seqlen_k, - int hdim_q, - int hdim_v, - bool i_perm, - bool o_perm, - int max_seqlen_q, - int max_seqlen_k, - int log_level = 0); From ab44b835667e29cf5ba844d2a7bdd52fc9f4cc17 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Wed, 22 Apr 2026 13:13:37 -0400 Subject: [PATCH 5/7] refactor to combine two kernel --- example/ck_tile/50_sparse_attn/CMakeLists.txt | 131 +-- .../codegen/ops/fmha_fwd_jenga.py | 141 +++- .../codegen/ops/fmha_fwd_vsa.py | 141 +++- .../codegen/ops/sparge_fwd_jenga.py | 799 ------------------ .../codegen/ops/sparge_fwd_vsa.py | 799 ------------------ .../ck_tile/50_sparse_attn/fmha_fwd_trek.hpp | 16 +- .../50_sparse_attn/jenga_sparge_attention.cpp | 189 ----- .../50_sparse_attn/jenga_sparge_attention.h | 27 - .../50_sparse_attn/sparge_blockmap_inst.cpp | 139 +++ .../50_sparse_attn/sparge_blockmap_trek.hpp | 13 + .../ck_tile/50_sparse_attn/test_sparge.cpp | 432 ++++++++++ .../test_sparge_jenga_sparse_attn.cpp | 422 --------- .../test_sparge_vsa_sparse_attn.cpp | 597 ------------- ...ock_fmha_pipeline_qr_ks_vs_async_jenga.hpp | 40 +- 14 files changed, 896 insertions(+), 2990 deletions(-) delete mode 100644 example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py delete mode 100644 example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py delete mode 100644 example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp delete mode 100644 example/ck_tile/50_sparse_attn/jenga_sparge_attention.h create mode 100644 example/ck_tile/50_sparse_attn/test_sparge.cpp delete mode 100644 example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp delete mode 100644 example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp diff --git a/example/ck_tile/50_sparse_attn/CMakeLists.txt b/example/ck_tile/50_sparse_attn/CMakeLists.txt index f234f631b6b..b20a661805f 100644 --- a/example/ck_tile/50_sparse_attn/CMakeLists.txt +++ b/example/ck_tile/50_sparse_attn/CMakeLists.txt @@ -88,68 +88,6 @@ target_compile_options(${EXAMPLE_JENGA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) -# ============================================================================ -# Sparge Jenga (64x128 tile) -# ============================================================================ -set(SPARGE_JENGA_CODE_GEN_ARGS - ${CMAKE_CURRENT_LIST_DIR}/generate.py - --api sparge_fwd_jenga - --receipt 600 -) - -execute_process( - COMMAND ${Python3_EXECUTABLE} ${SPARGE_JENGA_CODE_GEN_ARGS} - --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/sparge_jenga_blob_list.txt - RESULT_VARIABLE ret -) -if(ret AND NOT ret EQUAL 0) - message(FATAL_ERROR "Failed to generate Sparge Jenga kernel list") -endif() - -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/sparge_jenga_blob_list.txt SPARGE_JENGA_GEN_BLOBS) - -add_custom_command( - OUTPUT ${SPARGE_JENGA_GEN_BLOBS} - COMMAND ${Python3_EXECUTABLE} ${SPARGE_JENGA_CODE_GEN_ARGS} - --output_dir ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${CODE_GEN_SCRIPTS} - COMMENT "Generate CK Tile Sparge Jenga kernels" -) - -message(STATUS "Sparge Jenga kernel files to be generated: ${SPARGE_JENGA_GEN_BLOBS}") - -set(SPARGE_JENGA_INSTANCES 
"tile_sparge_jenga_instances") - -add_library(${SPARGE_JENGA_INSTANCES} OBJECT EXCLUDE_FROM_ALL - ${SPARGE_JENGA_GEN_BLOBS} - ${CMAKE_CURRENT_LIST_DIR}/jenga_sparge_attention.cpp -) -target_include_directories(${SPARGE_JENGA_INSTANCES} PRIVATE - ${CMAKE_CURRENT_LIST_DIR} - ${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn -) -set_source_files_properties(${SPARGE_JENGA_GEN_BLOBS} PROPERTIES LANGUAGE HIP) -set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/jenga_sparge_attention.cpp PROPERTIES LANGUAGE HIP) -set_property(TARGET ${SPARGE_JENGA_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) - -target_compile_options(${SPARGE_JENGA_INSTANCES} PRIVATE - -DCK_TILE_USE_BUFFER_ADDRESSING_BUILTIN - -DCK_TILE_FMHA_FWD_FAST_EXP2 - -Wno-undefined-func-template - -Wno-float-equal -) - -# Sparge + Jenga Example executable -set(EXAMPLE_SPARGE_JENGA_SPARSE_ATTN "tile_example_sparge_jenga_sparse_attn") -message(DEBUG "adding example ${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN}") -add_executable(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_jenga_sparse_attn.cpp) -target_link_libraries(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} ${SPARGE_JENGA_INSTANCES}) -target_include_directories(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_compile_options(${EXAMPLE_SPARGE_JENGA_SPARSE_ATTN} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal -) - # ============================================================================ # VSA Sparse Attention # ============================================================================ @@ -215,55 +153,6 @@ target_compile_options(${EXAMPLE_VSA_SPARSE_ATTN} PRIVATE -Wno-float-equal ) -# ============================================================================ -# Sparge VSA (64x128 tile) -# ============================================================================ -set(SPARGE_VSA_CODE_GEN_ARGS - ${CMAKE_CURRENT_LIST_DIR}/generate.py - --api sparge_fwd_vsa - --receipt 600 -) - -execute_process( - COMMAND ${Python3_EXECUTABLE} ${SPARGE_VSA_CODE_GEN_ARGS} - --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/sparge_vsa_blob_list.txt - RESULT_VARIABLE ret -) -if(ret AND NOT ret EQUAL 0) - message(FATAL_ERROR "Failed to generate Sparge VSA kernel list") -endif() - -file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/sparge_vsa_blob_list.txt SPARGE_VSA_GEN_BLOBS) - -add_custom_command( - OUTPUT ${SPARGE_VSA_GEN_BLOBS} - COMMAND ${Python3_EXECUTABLE} ${SPARGE_VSA_CODE_GEN_ARGS} - --output_dir ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${CODE_GEN_SCRIPTS} - COMMENT "Generate CK Tile Sparge VSA kernels" -) - -message(STATUS "Sparge VSA kernel files to be generated: ${SPARGE_VSA_GEN_BLOBS}") - -set(SPARGE_VSA_INSTANCES "tile_sparge_vsa_instances") - -add_library(${SPARGE_VSA_INSTANCES} OBJECT EXCLUDE_FROM_ALL - ${SPARGE_VSA_GEN_BLOBS} -) -target_include_directories(${SPARGE_VSA_INSTANCES} PRIVATE - ${CMAKE_CURRENT_LIST_DIR} - ${PROJECT_SOURCE_DIR}/include/ck_tile/ops/sparse_attn -) -set_source_files_properties(${SPARGE_VSA_GEN_BLOBS} PROPERTIES LANGUAGE HIP) -set_property(TARGET ${SPARGE_VSA_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS}) - -target_compile_options(${SPARGE_VSA_INSTANCES} PRIVATE - -DCK_TILE_USE_BUFFER_ADDRESSING_BUILTIN - -DCK_TILE_FMHA_FWD_FAST_EXP2 - -Wno-undefined-func-template - -Wno-float-equal -) - # ============================================================================ # Sparge BlockMap GPU Kernel (hand-written instantiation, no codegen) # ============================================================================ @@ 
-289,16 +178,20 @@ target_compile_options(${SPARGE_BLOCKMAP_INSTANCES} PRIVATE -Wno-float-equal ) -# Sparge + VSA Example executable (now links blockmap kernel too) -set(EXAMPLE_SPARGE_VSA_SPARSE_ATTN "tile_example_sparge_vsa_sparse_attn") -message(DEBUG "adding example ${EXAMPLE_SPARGE_VSA_SPARSE_ATTN}") -add_executable(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} EXCLUDE_FROM_ALL test_sparge_vsa_sparse_attn.cpp) -target_link_libraries(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} - ${SPARGE_VSA_INSTANCES} +# ---------------------------------------------------------------------------- +# Build unified Sparge test: combines blockmap, Jenga, and VSA attention +# for end-to-end evaluation and timing in a single executable. +# ---------------------------------------------------------------------------- +set(EXAMPLE_SPARGE "tile_example_sparge") +message(DEBUG "adding example ${EXAMPLE_SPARGE}") +add_executable(${EXAMPLE_SPARGE} EXCLUDE_FROM_ALL test_sparge.cpp) +target_link_libraries(${EXAMPLE_SPARGE} + ${SPARSE_ATTN_JENGA_INSTANCES} + ${SPARSE_ATTN_VSA_INSTANCES} ${SPARGE_BLOCKMAP_INSTANCES} ) -target_include_directories(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_compile_options(${EXAMPLE_SPARGE_VSA_SPARSE_ATTN} PRIVATE +target_include_directories(${EXAMPLE_SPARGE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_compile_options(${EXAMPLE_SPARGE} PRIVATE -Wno-undefined-func-template -Wno-float-equal ) diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py index a3d32652a98..1f0a78048d9 100644 --- a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py +++ b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py @@ -141,6 +141,17 @@ def update_file(file_path, content): constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} + +template<> +void fmha_jenga_fwd_oneshot_(const ck_tile::stream_config& s, fmha_jenga_fwd_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::stream_config{{s.stream_id_}}); +}} """ FMHA_FWD_API_FILENAME = "fmha_jenga_fwd_api.cpp" @@ -219,6 +230,45 @@ def update_file(file_path, content): }} """ +FMHA_FWD_ONESHOT_API_FILENAME = "fmha_jenga_fwd_oneshot_api.cpp" +FMHA_FWD_ONESHOT_API = """ +#include "fmha_fwd_trek.hpp" +#include + +void fmha_jenga_fwd_oneshot(fmha_jenga_fwd_traits t, fmha_jenga_fwd_args a, const ck_tile::stream_config& s){{ + + const bool has_load_tr = ck_tile::is_load_tr_supported(); + +{F_dispatch} + std::cerr << "fmha_jenga_fwd_oneshot: no matching dispatch (dtype=" << t.data_type + << " hdim_q=" << t.hdim_q << " hdim_v=" << t.hdim_v + << " seqlen_q=" << a.seqlen_q << " seqlen_k=" << a.seqlen_k + << " mask=" << static_cast(t.mask_type) << ")" << std::endl; +}} +""" + +FMHA_FWD_ONESHOT_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ +{F_dtype_case} + }} +""" + +FMHA_FWD_ONESHOT_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_ONESHOT_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_ONESHOT_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && + ({F_scheck}) && ({F_seqtune}) && 
({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ + using trait_ = fmha_jenga_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + fmha_jenga_fwd_oneshot_(s, a); + return; + }} +""" + @dataclass class CppConstraint: @@ -274,10 +324,7 @@ def scheck(self) -> str: @property def seqtune(self) -> str: - if self.bm0 == 128: - return "true/*fall back to largest tile*/" # group mode only generate spad/skpad == true - else: - return f"a.seqlen_q <= {self.bm0}" + return "true" @property def skcheck(self) -> str: @@ -447,6 +494,67 @@ def api(self) -> str: per_tr_load += " (void)t ; (void)s ; (void)a;" return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) + @property + def oneshot_api(self) -> str: + tr_load_cond_map = {"t": "has_load_tr", "f": "true"} + + per_tr_load = str() + for tr_load in ["t", "f"]: + per_dtypes = str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case = str() + for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): + traits = [ + t + for t in self.pool[dtype][(hdim, hdim_v)] + if tr_load == t.tr_load + ] + inners = str() + for k, trait in enumerate(traits): + if_k = "if" if k == 0 else "else if" + inners = inners + FMHA_FWD_ONESHOT_API_INNER_DISPATCH.format( + F_if=if_k, + F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], + F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], + F_trload=BOOL_MAP[trait.tr_load], + F_scheck=trait.scheck, + F_seqtune=trait.seqtune, + F_skcheck=trait.skcheck, + F_dcheck=trait.dcheck, + F_dvcheck=trait.dvcheck, + F_constraint=trait.constraint, + F_spad=BOOL_MAP[trait.spad], + F_skpad=BOOL_MAP[trait.skpad], + F_dpad=BOOL_MAP[trait.dpad], + F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, + F_bn0=trait.bn0, + F_bk0=trait.bk0, + F_bn1=trait.bn1, + F_bk1=trait.bk1, + F_bk0max=trait.bk0max, + F_hdim=hdim, + F_dtype=FWD_DTYPE_MAP[dtype], + ) + if_j = "if" if j == 0 else "else if" + per_hdim_case = per_hdim_case + FMHA_FWD_ONESHOT_API_PER_HDIM_CASE.format( + F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners + ) + if_i = "if" if i == 0 else "else if" + per_dtypes = per_dtypes + FMHA_FWD_ONESHOT_API_PER_DTYPE.format( + F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case + ) + per_tr_load += FMHA_FWD_ONESHOT_API_PER_TRLOAD.format( + F_if="if", + F_trload_cond=tr_load_cond_map[tr_load], + F_dtype_case=per_dtypes, + ) + if not per_tr_load: + per_tr_load += " (void)t ; (void)s ; (void)a;" + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_ONESHOT_API.format(F_dispatch=per_tr_load) + @dataclass class FmhaFwdTileSize: @@ -582,6 +690,27 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128): [ + FmhaFwdTileSize( # fmt: skip -- 64x128 tile matching blockmap kM0=64, kN0=128 + 64, + 128, + 64, + 128, + 64, + 128, + 4, + 1, + 1, + 4, + 1, + 1, + 16, + 16, + 16, + 16, + 16, + 16, + -1, + ), FmhaFwdTileSize( # fmt: skip 16, 32, @@ -780,7 +909,7 @@ def get_fwd_blobs( for tile, pipeline in itertools.product( tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) ): - if tile.F_bm0 != 128 or tile.F_bn0 != 128: + if tile.F_bm0 != 
64 or tile.F_bn0 != 128: continue if pipeline.tag != "qr_async": continue @@ -846,6 +975,7 @@ def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) + update_file(autogen_dir / FMHA_FWD_ONESHOT_API_FILENAME, api_pool.oneshot_api) def write_blobs( @@ -865,3 +995,4 @@ def list_blobs( for kernel in kernels: f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") + f.write((file_path.parent / GEN_DIR / FMHA_FWD_ONESHOT_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py index 038738de246..217cfcfe2a4 100644 --- a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py +++ b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py @@ -141,6 +141,17 @@ def update_file(file_path, content): constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} + +template<> +void fmha_vsa_fwd_oneshot_(const ck_tile::stream_config& s, fmha_vsa_fwd_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::stream_config{{s.stream_id_}}); +}} """ FMHA_FWD_API_FILENAME = "fmha_vsa_fwd_api.cpp" @@ -219,6 +230,45 @@ def update_file(file_path, content): }} """ +FMHA_FWD_ONESHOT_API_FILENAME = "fmha_vsa_fwd_oneshot_api.cpp" +FMHA_FWD_ONESHOT_API = """ +#include "fmha_fwd_trek.hpp" +#include + +void fmha_vsa_fwd_oneshot(fmha_vsa_fwd_traits t, fmha_vsa_fwd_args a, const ck_tile::stream_config& s){{ + + const bool has_load_tr = ck_tile::is_load_tr_supported(); + +{F_dispatch} + std::cerr << "fmha_vsa_fwd_oneshot: no matching dispatch (dtype=" << t.data_type + << " hdim_q=" << t.hdim_q << " hdim_v=" << t.hdim_v + << " seqlen_q=" << a.seqlen_q << " seqlen_k=" << a.seqlen_k + << " mask=" << static_cast(t.mask_type) << ")" << std::endl; +}} +""" + +FMHA_FWD_ONESHOT_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ +{F_dtype_case} + }} +""" + +FMHA_FWD_ONESHOT_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_ONESHOT_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_ONESHOT_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && + ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ + using trait_ = fmha_vsa_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; + fmha_vsa_fwd_oneshot_(s, a); + return; + }} +""" + @dataclass class CppConstraint: @@ -274,10 +324,7 @@ def scheck(self) -> str: @property def seqtune(self) -> str: - if self.bm0 == 128: - return "true/*fall back to largest tile*/" # group mode only generate spad/skpad == true - else: - return f"a.seqlen_q <= {self.bm0}" + return "true" @property def skcheck(self) -> str: @@ -447,6 +494,67 @@ def api(self) -> str: per_tr_load += " (void)t ; 
(void)s ; (void)a;" return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) + @property + def oneshot_api(self) -> str: + tr_load_cond_map = {"t": "has_load_tr", "f": "true"} + + per_tr_load = str() + for tr_load in ["t", "f"]: + per_dtypes = str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case = str() + for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): + traits = [ + t + for t in self.pool[dtype][(hdim, hdim_v)] + if tr_load == t.tr_load + ] + inners = str() + for k, trait in enumerate(traits): + if_k = "if" if k == 0 else "else if" + inners = inners + FMHA_FWD_ONESHOT_API_INNER_DISPATCH.format( + F_if=if_k, + F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], + F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], + F_trload=BOOL_MAP[trait.tr_load], + F_scheck=trait.scheck, + F_seqtune=trait.seqtune, + F_skcheck=trait.skcheck, + F_dcheck=trait.dcheck, + F_dvcheck=trait.dvcheck, + F_constraint=trait.constraint, + F_spad=BOOL_MAP[trait.spad], + F_skpad=BOOL_MAP[trait.skpad], + F_dpad=BOOL_MAP[trait.dpad], + F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, + F_bn0=trait.bn0, + F_bk0=trait.bk0, + F_bn1=trait.bn1, + F_bk1=trait.bk1, + F_bk0max=trait.bk0max, + F_hdim=hdim, + F_dtype=FWD_DTYPE_MAP[dtype], + ) + if_j = "if" if j == 0 else "else if" + per_hdim_case = per_hdim_case + FMHA_FWD_ONESHOT_API_PER_HDIM_CASE.format( + F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners + ) + if_i = "if" if i == 0 else "else if" + per_dtypes = per_dtypes + FMHA_FWD_ONESHOT_API_PER_DTYPE.format( + F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case + ) + per_tr_load += FMHA_FWD_ONESHOT_API_PER_TRLOAD.format( + F_if="if", + F_trload_cond=tr_load_cond_map[tr_load], + F_dtype_case=per_dtypes, + ) + if not per_tr_load: + per_tr_load += " (void)t ; (void)s ; (void)a;" + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_ONESHOT_API.format(F_dispatch=per_tr_load) + @dataclass class FmhaFwdTileSize: @@ -582,6 +690,27 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128): [ + FmhaFwdTileSize( # fmt: skip -- 64x128 tile matching blockmap kM0=64, kN0=128 + 64, + 128, + 64, + 128, + 64, + 128, + 4, + 1, + 1, + 4, + 1, + 1, + 16, + 16, + 16, + 16, + 16, + 16, + -1, + ), FmhaFwdTileSize( # fmt: skip 16, 32, @@ -780,7 +909,7 @@ def get_fwd_blobs( for tile, pipeline in itertools.product( tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) ): - if tile.F_bm0 != 128 or tile.F_bn0 != 128: + if tile.F_bm0 != 64 or tile.F_bn0 != 128: continue if pipeline.tag != "qr_async_vsa": continue @@ -846,6 +975,7 @@ def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) + update_file(autogen_dir / FMHA_FWD_ONESHOT_API_FILENAME, api_pool.oneshot_api) def write_blobs( @@ -865,3 +995,4 @@ def list_blobs( for kernel in kernels: f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") + f.write((file_path.parent / GEN_DIR / FMHA_FWD_ONESHOT_API_FILENAME).as_posix() + "\n") diff --git 
a/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py b/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py deleted file mode 100644 index 872da2326ea..00000000000 --- a/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_jenga.py +++ /dev/null @@ -1,799 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT -# generate kernel instances to speed up compilation - -import copy -from dataclasses import dataclass, field -import fnmatch -import itertools -import os -import os.path as path -from pathlib import Path -from typing import List, Optional, Tuple - -from codegen.cpp_symbol_map import ( - BOOL_MAP, - FWD_DTYPE_MAP, - LAYOUT_MAP, - MODE_MAP, - PIPELINE_ENUM_MAP, - PIPELINE_MAP, - get_mask_check_map, - get_mask_map, -) - -GEN_DIR = "" - - -def update_file(file_path, content): - """Update the file at file_path with the given content if it differs from the existing content. - - It avoids unnecessary touching of the file which triggers rebuilds - """ - - existing_content = "" - if path.exists(file_path): - with open(file_path, "r") as file: - existing_content = file.read() - if existing_content == content: - return - with open(file_path, "w") as file: - file.write(content) - - -DTYPE_BITS = {"fp32": 32, "fp16": 16, "bf16": 16} - -K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 192: 192, 256: 256} - -FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n -// auto generated by generate.py -#include "ck_tile/ops/fmha/block/variants.hpp" -#include "fmha_fwd_trek.hpp" -#include "pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp" -#include "kernel/fmha_fwd_jenga_kernel.hpp" - -""" - -# NOTE: Jenga sparse attention kernel has the following restrictions enforced by static_assert: -# - Group mode: NOT supported (batch mode only) -# - Bias: NOT supported (NO_BIAS only) -# - LSE output: NOT supported (false only) -# - Dropout: NOT supported (false only) -# - Logits soft-cap: NOT supported (false only) -# - FP8 static quantization: NOT supported (NO_SCALE only) -# The template below hardcodes these unsupported features accordingly. 
- -FMHA_FWD_KERNEL_BODY = """ -using fmha_dtype_{F_idx} = {F_dtype}; - -using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; - -using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, - ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, - ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, - ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, - {F_vlayout}>; - -// TileFmhaTraits: spad, skpad, dpad, dvpad, has_logits_soft_cap, bias_enum, -// store_lse, has_dropout, has_randval, quant_scale_enum, occupancy, is_v_rowmajor_skip -using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, - {F_skpad}, - {F_dpad}, - {F_dvpad}, - false, // has_logits_soft_cap - NOT supported - ck_tile::BlockAttentionBiasEnum::NO_BIAS, // bias - NOT supported - false, // store_lse - NOT supported - false, // has_dropout - NOT supported - false, // has_randval - NOT supported - ck_tile::BlockAttentionQuantScaleEnum::NO_SCALE, // FP8 quant - NOT supported - {F_occupancy}, - false>; - -using fmha_variant_{F_idx} = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>; // logits_soft_cap=0 (NOT supported) - -using fmha_mask_{F_idx} = {F_mask}; - -using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< - typename FmhaSparseFwdTypeConfig::QDataType, - typename FmhaSparseFwdTypeConfig::KDataType, - typename FmhaSparseFwdTypeConfig::VDataType, - typename FmhaSparseFwdTypeConfig::SaccDataType, - typename FmhaSparseFwdTypeConfig::SMPLComputeDataType, - typename FmhaSparseFwdTypeConfig::BiasDataType, - typename FmhaSparseFwdTypeConfig::RandValOutputDataType, - typename FmhaSparseFwdTypeConfig::LSEDataType, - typename FmhaSparseFwdTypeConfig::PDataType, - typename FmhaSparseFwdTypeConfig::OaccDataType, - typename FmhaSparseFwdTypeConfig::ODataType, - fmha_shape_{F_idx}, - {F_mode}, - fmha_variant_{F_idx}, - fmha_mask_{F_idx}, - {F_trload}, - fmha_trait_{F_idx}>; - -using fmha_pipeline_{F_idx} = {F_pipeline}< - fmha_pipeline_problem_{F_idx}>; - -using fmha_epilogue_{F_idx} = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaSparseFwdTypeConfig<{F_dtype}>::ODataType, - {F_spad}, {F_dvpad}>>; - -using fmha_kernel_{F_idx} = - ck_tile::FmhaFwdJengaKernel; - -using trait_{F_idx} = fmha_jenga_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, false/*logits*/, fmha_mask_{F_idx}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; - -#include - -template<> -float fmha_jenga_fwd_(const ck_tile::stream_config& s, fmha_jenga_fwd_args a) -{{ - using k_ = fmha_kernel_{F_idx}; - if(s.log_level_ > 0) - std::cout << ", " << "{F_kernel_name}" << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - const dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); -}} -""" - -FMHA_FWD_API_FILENAME = "sparge_jenga_fwd_api.cpp" -FMHA_FWD_API = """ -#include - -#include - -namespace {{ -bool get_num_cus(unsigned& num_cus) {{ - int device; - auto status = hipGetDevice(&device); - if(status != hipSuccess) {{ - fprintf(stderr, "failed to get device"); - return false; - }} - - hipDeviceProp_t props{{}}; - status = hipGetDeviceProperties(&props, device); - if(status != hipSuccess) {{ - fprintf(stderr, "failed to get device properties"); - return false; - }} - - num_cus = props.multiProcessorCount; - return true; -}} - -unsigned get_num_thread_blocks(unsigned batch, unsigned 
nheads, unsigned max_seqlen_q, unsigned kM0) {{ - const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0; - const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1 - - return batch * nheads * num_m_blocks * num_n_blocks; -}} -}} // namespace - -float sparge_jenga_fwd(fmha_jenga_fwd_traits t, fmha_jenga_fwd_args a, const ck_tile::stream_config& s){{ - float r = -1; - - [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate - - unsigned num_cus; - if (!get_num_cus(num_cus)) {{ - return r; - }} - - [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{ - return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0); - }}; - - const bool has_load_tr = ck_tile::is_load_tr_supported(); - -{F_dispatch} - return r; -}} -""" - -FMHA_FWD_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ -{F_dtype_case} - }} -""" - -FMHA_FWD_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ -{F_hdim_case} - }} -""" -FMHA_FWD_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ -{F_inner_dispatch} - }} -""" - -FMHA_FWD_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && - ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ - using trait_ = fmha_jenga_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; - return fmha_jenga_fwd_(s, a); - }} -""" - - -@dataclass -class CppConstraint: - bool_expr: str = None - - def __str__(self): - if self.bool_expr is None: - return "true" - else: - return f"{self.bool_expr}" - - def __and__(self, other): - return CppConstraint(f"({str(self)}) && ({str(other)})") - - -@dataclass -class FmhaFwdApiTrait: - pipeline_tag: str - # sync with fmha_fwd_traits<>, to generate fallback calls - hdim: str - dtype: str # data type - mode: str # value from MODE_MAP - bm0: int # tile size along q seqlen (block size) - bn0: int # tile size along qk seqlen - bk0: int # tile size along qk gemm unroll - bn1: int # tile size along v head_dim - bk1: int # tile size along kv gemm unroll - bk0max: int - vlayout: str - logits: str - mask: str - spad: str - skpad: str - dpad: str - dvpad: str - tr_load: str - constraint: CppConstraint - - @property - def name(self) -> str: - return ( - f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-" - + f"{self.vlayout}-{self.logits}-{self.mask}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}" - ) - - @property - def scheck(self) -> str: - if self.mode == "group": - return "true/*group mode spad always true*/" # group mode only generate spad/skpad == true - if self.spad == "t": - return "true" # always support - return "true" - - @property - def seqtune(self) -> str: - return "true" - - @property - def skcheck(self) -> str: - if self.mode == "group": - return "true/*group mode skpad always true*/" # group mode only generate spad/skpad == true - if self.skpad == "t": - return f"a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0" - return f"a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0" - - @property - def dcheck(self) -> str: - vec = int((32 * 4) / DTYPE_BITS[self.dtype]) - if self.dpad == "t": - return f"a.hdim_q % {vec} == 0" - assert False - - @property - def dvcheck(self) -> str: - vec = int((32 * 4) / DTYPE_BITS[self.dtype]) - if self.dvpad == "t": - return 
f"a.hdim_v % {vec} == 0" - assert False - - -@dataclass -class FmhaFwdPipeline: - tag: str - - F_vlayout: str # row/col - F_spad: str # true/false - F_skpad: str # - F_dpad: str # - F_dvpad: str # - F_logits: str # t/f - F_mask: str # value from MASK_MAP - F_trload: str # true/false - F_constraint: CppConstraint = field(default_factory=CppConstraint) - - @property - def name(self) -> str: - def pad_name() -> str: - n = "" - if self.F_spad == "t": - n += "s" - if self.F_skpad == "t": - n += "sk" - if self.F_dpad == "t": - n += "d" - if self.F_dvpad == "t": - n += "dv" - if n != "": - n = "p" + n - return n - - pn = pad_name() - n = f"{self.tag}_v{self.F_vlayout[0]}" - if pn != "": - n += f"_{pn}" - else: - n += "_npad" - - if self.F_logits == "t": - n += "_logits" - else: - n += "_nlogits" - - n += "_nbias" - - if self.F_mask[0:2] == "s_": - if self.F_mask == "s_mask": - n += "_mask" - else: - n += "_nmask" - else: - if self.F_mask != "no": - n += f"_m{self.F_mask[0]}" - else: - n += "_nmask" - - n += "_nskip" - - n += "_nsquant" - - if self.F_trload == "t": - n += "_trload" - else: - n += "_ntrload" - - return n - - -class FmhaFwdApiPool: - def __init__(self, mask_impl): - self.pool = dict() - self.mask_impl = mask_impl - - def register_traits(self, trait: FmhaFwdApiTrait) -> None: - # TODO: do we need to check duplication? - if trait.dtype not in self.pool.keys(): - self.pool[trait.dtype] = dict() - hdim = trait.hdim, trait.bn1 - if hdim not in self.pool[trait.dtype].keys(): - self.pool[trait.dtype][hdim] = list() - - self.pool[trait.dtype][hdim].append(copy.copy(trait)) - - @property - def api(self) -> str: - tr_load_cond_map = {"t": "has_load_tr", "f": "true"} - - per_tr_load = str() - for tr_load in ["t", "f"]: - per_dtypes = str() - for i, dtype in enumerate(self.pool.keys()): - per_hdim_case = str() - for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): - traits = [ - t - for t in self.pool[dtype][(hdim, hdim_v)] - if tr_load == t.tr_load - ] - inners = str() - for k, trait in enumerate(traits): - if_k = "if" if k == 0 else "else if" - inners = inners + FMHA_FWD_API_INNER_DISPATCH.format( - F_if=if_k, - F_vlayout=LAYOUT_MAP[trait.vlayout], - F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], - # F_logits removed - hardcoded to false (NOT supported) - F_mask=get_mask_map(self.mask_impl)[trait.mask], - F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], - F_trload=BOOL_MAP[trait.tr_load], - F_scheck=trait.scheck, - F_seqtune=trait.seqtune, - F_skcheck=trait.skcheck, - F_dcheck=trait.dcheck, - F_dvcheck=trait.dvcheck, - F_constraint=trait.constraint, - F_spad=BOOL_MAP[trait.spad], - F_skpad=BOOL_MAP[trait.skpad], - F_dpad=BOOL_MAP[trait.dpad], - F_dvpad=BOOL_MAP[trait.dvpad], - F_bm0=trait.bm0, - F_bn0=trait.bn0, - F_bk0=trait.bk0, - F_bn1=trait.bn1, - F_bk1=trait.bk1, - F_bk0max=trait.bk0max, - F_hdim=hdim, - F_dtype=FWD_DTYPE_MAP[dtype], - ) - if_j = "if" if j == 0 else "else if" - per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format( - F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners - ) - if_i = "if" if i == 0 else "else if" - per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format( - F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case - ) - per_tr_load += FMHA_FWD_API_PER_TRLOAD.format( - F_if="if", - F_trload_cond=tr_load_cond_map[tr_load], - F_dtype_case=per_dtypes, - ) - if not per_tr_load: - # empty string we add some ignore to suppress warning in api - per_tr_load += " (void)t ; (void)s ; (void)a;" - return 
FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) - - -@dataclass -class FmhaFwdTileSize: - F_bm0: int # tile size along q seqlen (block size) - F_bn0: int # tile size along k seqlen - F_bk0: int # tile size along qk gemm unroll - F_bn1: int # tile size along v head_dim - F_bk1: int # tile size along kv gemm unroll - F_bk0max: int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) - F_rm0: int # number of warps for gemm0 along q seqlen - F_rn0: int # number of warps for gemm0 along k seqlen - F_rk0: int # number of warps for gemm0 along head dim q (not used) - F_rm1: int # number of warps for gemm1 along q seqlen - F_rn1: int # number of warps for gemm1 along head dim v - F_rk1: int # number of warps for gemm1 along k seqlen (not used) - F_wm0: int # gemm0 warp size along m - F_wn0: int # gemm0 warp size along n - F_wk0: int # gemm0 warp size along k - F_wm1: int # gemm1 warp size along m - F_wn1: int # gemm1 warp size along n - F_wk1: int # gemm1 warp size along k - F_occupancy: int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy - F_constraint: CppConstraint = field(default_factory=CppConstraint) - - @property - def name(self) -> str: - return ( - f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" - + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" - + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" - + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") - ) - - -@dataclass -class FmhaFwdKernel: - F_idx: int # this is not a tunable, but a counter to differentiate symbol - F_hdim: int # hdim - F_dtype: str # data type - F_mode: str # value from MODE_MAP - F_tile: FmhaFwdTileSize - F_pipeline: FmhaFwdPipeline - mask_impl: str - - @property - def template(self) -> str: - # kernel_body removed - unused - return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_KERNEL_BODY.format( - F_idx=self.F_idx, - F_hdim=self.F_hdim, - F_dtype=FWD_DTYPE_MAP[self.F_dtype], - F_bm0=self.F_tile.F_bm0, - F_bn0=self.F_tile.F_bn0, - F_bk0=self.F_tile.F_bk0, - F_bn1=self.F_tile.F_bn1, - F_bk1=self.F_tile.F_bk1, - F_bk0max=self.F_tile.F_bk0max, - F_rm0=self.F_tile.F_rm0, - F_rn0=self.F_tile.F_rn0, - F_rk0=self.F_tile.F_rk0, - F_rm1=self.F_tile.F_rm1, - F_rn1=self.F_tile.F_rn1, - F_rk1=self.F_tile.F_rk1, - F_wm0=self.F_tile.F_wm0, - F_wn0=self.F_tile.F_wn0, - F_wk0=self.F_tile.F_wk0, - F_wm1=self.F_tile.F_wm1, - F_wn1=self.F_tile.F_wn1, - F_wk1=self.F_tile.F_wk1, - F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout], - F_spad=BOOL_MAP[self.F_pipeline.F_spad], - F_skpad=BOOL_MAP[self.F_pipeline.F_skpad], - F_dpad=BOOL_MAP[self.F_pipeline.F_dpad], - F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad], - # F_logits removed - hardcoded to false in template (NOT supported) - F_occupancy=self.F_tile.F_occupancy, - F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag], - F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask], - F_mode=MODE_MAP[self.F_mode], - F_pipeline=PIPELINE_MAP[self.F_pipeline.tag], - F_trload=BOOL_MAP[self.F_pipeline.F_trload], - F_kernel_name=self.name, - ) - - @property - def name(self) -> str: - # TODO: we don't encode idx here - return ( - f"fmha_jenga_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" - + self.F_tile.name - + "_" - + self.F_pipeline.name - ) - - @property - def filename(self) -> str: - return self.name + ".cpp" - - def api_trait(self) -> FmhaFwdApiTrait: - return 
FmhaFwdApiTrait( - pipeline_tag=self.F_pipeline.tag, - hdim=str(self.F_hdim), - dtype=self.F_dtype, - mode=self.F_mode, - bm0=self.F_tile.F_bm0, - bn0=self.F_tile.F_bn0, - bk0=self.F_tile.F_bk0, - bn1=self.F_tile.F_bn1, - bk1=self.F_tile.F_bk1, - bk0max=self.F_tile.F_bk0max, - vlayout=self.F_pipeline.F_vlayout, - mask=self.F_pipeline.F_mask, - logits=self.F_pipeline.F_logits, - spad=self.F_pipeline.F_spad, - skpad=self.F_pipeline.F_skpad, - dpad=self.F_pipeline.F_dpad, - dvpad=self.F_pipeline.F_dvpad, - tr_load=self.F_pipeline.F_trload, - constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint, - ) - - -class KernelComponentFactory: - # TODO: design a more practical way to do it - # this is current supported tile size per hdim - @staticmethod - def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: - if dtype == "fp16" or dtype == "bf16": - return { - # (32, 32) : [FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - # (64, 64) : [FmhaFwdTileSize(16, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1), - # FmhaFwdTileSize(32, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1), - # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - (128, 128): [ - FmhaFwdTileSize( - 64, - 128, - 64, - 128, - 64, - 128, - 4, - 1, - 1, - 4, - 1, - 1, - 16, - 16, - 16, - 16, - 16, - 16, - -1, - ), - ], - # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], - # (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - # (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], - # (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - } - else: - return None - - # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad - # support this in future - @staticmethod - def get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) -> List[FmhaFwdPipeline]: - # this function will populate a list possible pipelines - # TODO: the order of List matters! the later in this list will be also be checked later - # NOTE: logits soft-cap is NOT supported by Jenga sparse attention (enforced by static_assert) - pipelines = [] - if dtype in ["fp16", "bf16"]: - for logits, mask in itertools.product( - ["f"], # logits soft-cap NOT supported, always false - get_mask_map(mask_impl).keys(), - ): - if hdim == 256 and hdim_v == 256: - # jenga fmha only supports dim <= 192 for now. 
- continue - pipelines.append( - FmhaFwdPipeline( # fmt: skip - "qr_async", - "row", - "t", - "f", - "t", - "t", - logits, - mask, - "f", - ) - ) - pipelines.append( - FmhaFwdPipeline( # fmt: skip - "qr_async", - "row", - "t", - "t", - "t", - "t", - logits, - mask, - "f", - ) - ) - else: - assert False - return pipelines - - -class CustomFactory(KernelComponentFactory): - @staticmethod - def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: - result = KernelComponentFactory.get_hdim_tile_size_dict(dtype) - if dtype == "fp16" or dtype == "bf16": - if (128, 128) in result.keys(): - result[(128, 128)].insert( - 0, - FmhaFwdTileSize( - 64, - 128, - 64, - 128, - 64, - 128, - 4, - 1, - 1, - 4, - 1, - 1, - 16, - 16, - 16, - 16, - 16, - 16, - -1, - CppConstraint( - "get_num_blocks(128) < num_cus * min_cu_util_rate" - ), - ), - ) - return result - - -def get_fwd_blobs( - kernel_filter: Optional[str], receipt, optdim_list, mask_impl -) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: - gen = list() - api_pool = FmhaFwdApiPool(mask_impl) - - factory = ( - CustomFactory - if os.environ.get("CK_TILE_FMHA_FWD_CUSTOM_FACTORY", "0") == "1" - else KernelComponentFactory - ) - - # Only generate fp16/bf16 kernels for now. - # NOTE: Jenga sparse attention only supports batch mode (group mode NOT supported, enforced by static_assert) - for dtype in ["fp16", "bf16"]: - d = factory.get_hdim_tile_size_dict(dtype) - if d is None: - continue - for ((hdim, hdim_v), tiles), mode in itertools.product(d.items(), ["batch"]): - for tile, pipeline in itertools.product( - tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) - ): - if pipeline.tag != "qr_async": - continue - k = FmhaFwdKernel( - F_idx=2, - F_hdim=hdim, - F_dtype=dtype, - F_mode=mode, - F_tile=tile, - F_pipeline=pipeline, - mask_impl=mask_impl, - ) - if kernel_filter != "": - if not fnmatch.fnmatch(k.name, kernel_filter): - continue - if optdim_list != [-1]: - if hdim not in optdim_list: - continue - # 2 - Flash attention integration - if receipt in (2, 3): - cond = dtype in ["fp16", "bf16"] - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - # PyTorch integration - elif receipt == 4: - cond = dtype in ["fp16", "bf16"] - cond &= pipeline.F_vlayout == "row" - cond &= mode == "batch" - cond &= pipeline.F_logits == "f" - if not cond: - continue - # Aiter(mha_fwd) integration - elif receipt == 100: - cond = dtype in ["fp16", "bf16"] - cond &= mode == "batch" - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - # Aiter(mha_varlen_fwd) integration - elif receipt == 200: - cond = dtype in ["fp16", "bf16"] - cond &= mode == "group" - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - # aiter::mha_fwd C++ api integration - elif receipt == 600: - cond = dtype in ["fp16", "bf16"] - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - - api_pool.register_traits(k.api_trait()) - gen.append(k) - - return (api_pool, gen) - - -def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: - update_file(autogen_dir / kernel.filename, kernel.template) - - -def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: - update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) - - -def write_blobs( - output_dir: Path, kernel_filter: str, receipt, optdim_list, mask_impl -) -> None: - api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) - for kernel in kernels: - write_single_fwd_kernel(kernel, output_dir) - write_fwd_api(api_pool, output_dir) - - 
-def list_blobs( - file_path: Path, kernel_filter: str, receipt, optdim_list, mask_impl -) -> None: - with file_path.open("a") as f: - _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) - for kernel in kernels: - f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") - f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py b/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py deleted file mode 100644 index c9a389df3fa..00000000000 --- a/example/ck_tile/50_sparse_attn/codegen/ops/sparge_fwd_vsa.py +++ /dev/null @@ -1,799 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT -# generate kernel instances to speed up compilation - -import copy -from dataclasses import dataclass, field -import fnmatch -import itertools -import os -import os.path as path -from pathlib import Path -from typing import List, Optional, Tuple - -from codegen.cpp_symbol_map import ( - BOOL_MAP, - FWD_DTYPE_MAP, - LAYOUT_MAP, - MODE_MAP, - PIPELINE_ENUM_MAP, - PIPELINE_MAP, - get_mask_check_map, - get_mask_map, -) - -GEN_DIR = "" - - -def update_file(file_path, content): - """Update the file at file_path with the given content if it differs from the existing content. - - It avoids unnecessary touching of the file which triggers rebuilds - """ - - existing_content = "" - if path.exists(file_path): - with open(file_path, "r") as file: - existing_content = file.read() - if existing_content == content: - return - with open(file_path, "w") as file: - file.write(content) - - -DTYPE_BITS = {"fp32": 32, "fp16": 16, "bf16": 16} - -K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 192: 192, 256: 256} - -FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n -// auto generated by generate.py -#include "ck_tile/ops/fmha/block/variants.hpp" -#include "fmha_fwd_trek.hpp" -#include "pipeline/block_fmha_pipeline_qr_ks_vs_async_vsa.hpp" -#include "kernel/fmha_fwd_vsa_kernel.hpp" - -""" - -# NOTE: VSA sparse attention kernel has the following restrictions enforced by static_assert: -# - Group mode: NOT supported (batch mode only) -# - Bias: NOT supported (NO_BIAS only) -# - LSE output: NOT supported (false only) -# - Dropout: NOT supported (false only) -# - Logits soft-cap: NOT supported (false only) -# - FP8 static quantization: NOT supported (NO_SCALE only) -# The template below hardcodes these unsupported features accordingly. 
- -FMHA_FWD_KERNEL_BODY = """ -using fmha_dtype_{F_idx} = {F_dtype}; - -using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; - -using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, - ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, - ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, - ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, - {F_vlayout}>; - -// TileFmhaTraits: spad, skpad, dpad, dvpad, has_logits_soft_cap, bias_enum, -// store_lse, has_dropout, has_randval, quant_scale_enum, occupancy, is_v_rowmajor_skip -using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, - {F_skpad}, - {F_dpad}, - {F_dvpad}, - false, // has_logits_soft_cap - NOT supported - ck_tile::BlockAttentionBiasEnum::NO_BIAS, // bias - NOT supported - false, // store_lse - NOT supported - false, // has_dropout - NOT supported - false, // has_randval - NOT supported - ck_tile::BlockAttentionQuantScaleEnum::NO_SCALE, // FP8 quant - NOT supported - {F_occupancy}, - false>; - -using fmha_variant_{F_idx} = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>; // logits_soft_cap=0 (NOT supported) - -using fmha_mask_{F_idx} = {F_mask}; - -using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< - typename FmhaSparseFwdTypeConfig::QDataType, - typename FmhaSparseFwdTypeConfig::KDataType, - typename FmhaSparseFwdTypeConfig::VDataType, - typename FmhaSparseFwdTypeConfig::SaccDataType, - typename FmhaSparseFwdTypeConfig::SMPLComputeDataType, - typename FmhaSparseFwdTypeConfig::BiasDataType, - typename FmhaSparseFwdTypeConfig::RandValOutputDataType, - typename FmhaSparseFwdTypeConfig::LSEDataType, - typename FmhaSparseFwdTypeConfig::PDataType, - typename FmhaSparseFwdTypeConfig::OaccDataType, - typename FmhaSparseFwdTypeConfig::ODataType, - fmha_shape_{F_idx}, - {F_mode}, - fmha_variant_{F_idx}, - fmha_mask_{F_idx}, - {F_trload}, - fmha_trait_{F_idx}>; - -using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaPipelineQRKSVSAsyncVSA< - fmha_pipeline_problem_{F_idx}>; - -using fmha_epilogue_{F_idx} = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaSparseFwdTypeConfig<{F_dtype}>::ODataType, - {F_spad}, {F_dvpad}>>; - -using fmha_kernel_{F_idx} = - ck_tile::FmhaFwdVSAKernel; - -using trait_{F_idx} = fmha_vsa_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, false/*logits*/, fmha_mask_{F_idx}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; - -#include - -template<> -float fmha_vsa_fwd_(const ck_tile::stream_config& s, fmha_vsa_fwd_args a) -{{ - using k_ = fmha_kernel_{F_idx}; - if(s.log_level_ > 0) - std::cout << ", " << "{F_kernel_name}" << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - const dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); -}} -""" - -FMHA_FWD_API_FILENAME = "sparge_vsa_fwd_api.cpp" -FMHA_FWD_API = """ -#include - -#include - -namespace {{ -bool get_num_cus(unsigned& num_cus) {{ - int device; - auto status = hipGetDevice(&device); - if(status != hipSuccess) {{ - fprintf(stderr, "failed to get device"); - return false; - }} - - hipDeviceProp_t props{{}}; - status = hipGetDeviceProperties(&props, device); - if(status != hipSuccess) {{ - fprintf(stderr, "failed to get device properties"); - return false; - }} - - num_cus = props.multiProcessorCount; - return true; -}} - -unsigned get_num_thread_blocks(unsigned 
batch, unsigned nheads, unsigned max_seqlen_q, unsigned kM0) {{ - const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0; - const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1 - - return batch * nheads * num_m_blocks * num_n_blocks; -}} -}} // namespace - -float sparge_vsa_fwd(fmha_vsa_fwd_traits t, fmha_vsa_fwd_args a, const ck_tile::stream_config& s){{ - float r = -1; - - [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate - - unsigned num_cus; - if (!get_num_cus(num_cus)) {{ - return r; - }} - - [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{ - return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0); - }}; - - const bool has_load_tr = ck_tile::is_load_tr_supported(); - -{F_dispatch} - return r; -}} -""" - -FMHA_FWD_API_PER_TRLOAD = """ {F_if}({F_trload_cond}){{ -{F_dtype_case} - }} -""" - -FMHA_FWD_API_PER_DTYPE = """ {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ -{F_hdim_case} - }} -""" -FMHA_FWD_API_PER_HDIM_CASE = """ {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ -{F_inner_dispatch} - }} -""" - -FMHA_FWD_API_INNER_DISPATCH = """ {F_if}((t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && - ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{ - using trait_ = fmha_vsa_fwd_traits_<{F_hdim}, {F_dtype}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, false/*logits*/, {F_mask}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}>; - return fmha_vsa_fwd_(s, a); - }} -""" - - -@dataclass -class CppConstraint: - bool_expr: str = None - - def __str__(self): - if self.bool_expr is None: - return "true" - else: - return f"{self.bool_expr}" - - def __and__(self, other): - return CppConstraint(f"({str(self)}) && ({str(other)})") - - -@dataclass -class FmhaFwdApiTrait: - pipeline_tag: str - # sync with fmha_fwd_traits<>, to generate fallback calls - hdim: str - dtype: str # data type - mode: str # value from MODE_MAP - bm0: int # tile size along q seqlen (block size) - bn0: int # tile size along qk seqlen - bk0: int # tile size along qk gemm unroll - bn1: int # tile size along v head_dim - bk1: int # tile size along kv gemm unroll - bk0max: int - vlayout: str - logits: str - mask: str - spad: str - skpad: str - dpad: str - dvpad: str - tr_load: str - constraint: CppConstraint - - @property - def name(self) -> str: - return ( - f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-" - + f"{self.vlayout}-{self.logits}-{self.mask}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}" - ) - - @property - def scheck(self) -> str: - if self.mode == "group": - return "true/*group mode spad always true*/" # group mode only generate spad/skpad == true - if self.spad == "t": - return "true" # always support - return "true" - - @property - def seqtune(self) -> str: - return "true" - - @property - def skcheck(self) -> str: - if self.mode == "group": - return "true/*group mode skpad always true*/" # group mode only generate spad/skpad == true - if self.skpad == "t": - return f"a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0" - return f"a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0" - - @property - def dcheck(self) -> str: - vec = int((32 * 4) / DTYPE_BITS[self.dtype]) - if self.dpad == "t": - return f"a.hdim_q % {vec} == 0" - assert False - - @property - def dvcheck(self) -> str: - vec = int((32 * 4) / DTYPE_BITS[self.dtype]) - if self.dvpad == "t": - return 
f"a.hdim_v % {vec} == 0" - assert False - - -@dataclass -class FmhaFwdPipeline: - tag: str - - F_vlayout: str # row/col - F_spad: str # true/false - F_skpad: str # - F_dpad: str # - F_dvpad: str # - F_logits: str # t/f - F_mask: str # value from MASK_MAP - F_trload: str # true/false - F_constraint: CppConstraint = field(default_factory=CppConstraint) - - @property - def name(self) -> str: - def pad_name() -> str: - n = "" - if self.F_spad == "t": - n += "s" - if self.F_skpad == "t": - n += "sk" - if self.F_dpad == "t": - n += "d" - if self.F_dvpad == "t": - n += "dv" - if n != "": - n = "p" + n - return n - - pn = pad_name() - n = f"{self.tag}_v{self.F_vlayout[0]}" - if pn != "": - n += f"_{pn}" - else: - n += "_npad" - - if self.F_logits == "t": - n += "_logits" - else: - n += "_nlogits" - - n += "_nbias" - - if self.F_mask[0:2] == "s_": - if self.F_mask == "s_mask": - n += "_mask" - else: - n += "_nmask" - else: - if self.F_mask != "no": - n += f"_m{self.F_mask[0]}" - else: - n += "_nmask" - - n += "_nskip" - - n += "_nsquant" - - if self.F_trload == "t": - n += "_trload" - else: - n += "_ntrload" - - return n - - -class FmhaFwdApiPool: - def __init__(self, mask_impl): - self.pool = dict() - self.mask_impl = mask_impl - - def register_traits(self, trait: FmhaFwdApiTrait) -> None: - # TODO: do we need to check duplication? - if trait.dtype not in self.pool.keys(): - self.pool[trait.dtype] = dict() - hdim = trait.hdim, trait.bn1 - if hdim not in self.pool[trait.dtype].keys(): - self.pool[trait.dtype][hdim] = list() - - self.pool[trait.dtype][hdim].append(copy.copy(trait)) - - @property - def api(self) -> str: - tr_load_cond_map = {"t": "has_load_tr", "f": "true"} - - per_tr_load = str() - for tr_load in ["t", "f"]: - per_dtypes = str() - for i, dtype in enumerate(self.pool.keys()): - per_hdim_case = str() - for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): - traits = [ - t - for t in self.pool[dtype][(hdim, hdim_v)] - if tr_load == t.tr_load - ] - inners = str() - for k, trait in enumerate(traits): - if_k = "if" if k == 0 else "else if" - inners = inners + FMHA_FWD_API_INNER_DISPATCH.format( - F_if=if_k, - F_vlayout=LAYOUT_MAP[trait.vlayout], - F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], - # F_logits removed - hardcoded to false (NOT supported) - F_mask=get_mask_map(self.mask_impl)[trait.mask], - F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], - F_trload=BOOL_MAP[trait.tr_load], - F_scheck=trait.scheck, - F_seqtune=trait.seqtune, - F_skcheck=trait.skcheck, - F_dcheck=trait.dcheck, - F_dvcheck=trait.dvcheck, - F_constraint=trait.constraint, - F_spad=BOOL_MAP[trait.spad], - F_skpad=BOOL_MAP[trait.skpad], - F_dpad=BOOL_MAP[trait.dpad], - F_dvpad=BOOL_MAP[trait.dvpad], - F_bm0=trait.bm0, - F_bn0=trait.bn0, - F_bk0=trait.bk0, - F_bn1=trait.bn1, - F_bk1=trait.bk1, - F_bk0max=trait.bk0max, - F_hdim=hdim, - F_dtype=FWD_DTYPE_MAP[dtype], - ) - if_j = "if" if j == 0 else "else if" - per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format( - F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners - ) - if_i = "if" if i == 0 else "else if" - per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format( - F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case - ) - per_tr_load += FMHA_FWD_API_PER_TRLOAD.format( - F_if="if", - F_trload_cond=tr_load_cond_map[tr_load], - F_dtype_case=per_dtypes, - ) - if not per_tr_load: - # empty string we add some ignore to suppress warning in api - per_tr_load += " (void)t ; (void)s ; (void)a;" - return 
FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_tr_load) - - -@dataclass -class FmhaFwdTileSize: - F_bm0: int # tile size along q seqlen (block size) - F_bn0: int # tile size along k seqlen - F_bk0: int # tile size along qk gemm unroll - F_bn1: int # tile size along v head_dim - F_bk1: int # tile size along kv gemm unroll - F_bk0max: int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) - F_rm0: int # number of warps for gemm0 along q seqlen - F_rn0: int # number of warps for gemm0 along k seqlen - F_rk0: int # number of warps for gemm0 along head dim q (not used) - F_rm1: int # number of warps for gemm1 along q seqlen - F_rn1: int # number of warps for gemm1 along head dim v - F_rk1: int # number of warps for gemm1 along k seqlen (not used) - F_wm0: int # gemm0 warp size along m - F_wn0: int # gemm0 warp size along n - F_wk0: int # gemm0 warp size along k - F_wm1: int # gemm1 warp size along m - F_wn1: int # gemm1 warp size along n - F_wk1: int # gemm1 warp size along k - F_occupancy: int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy - F_constraint: CppConstraint = field(default_factory=CppConstraint) - - @property - def name(self) -> str: - return ( - f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" - + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" - + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" - + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") - ) - - -@dataclass -class FmhaFwdKernel: - F_idx: int # this is not a tunable, but a counter to differentiate symbol - F_hdim: int # hdim - F_dtype: str # data type - F_mode: str # value from MODE_MAP - F_tile: FmhaFwdTileSize - F_pipeline: FmhaFwdPipeline - mask_impl: str - - @property - def template(self) -> str: - # kernel_body removed - unused - return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_KERNEL_BODY.format( - F_idx=self.F_idx, - F_hdim=self.F_hdim, - F_dtype=FWD_DTYPE_MAP[self.F_dtype], - F_bm0=self.F_tile.F_bm0, - F_bn0=self.F_tile.F_bn0, - F_bk0=self.F_tile.F_bk0, - F_bn1=self.F_tile.F_bn1, - F_bk1=self.F_tile.F_bk1, - F_bk0max=self.F_tile.F_bk0max, - F_rm0=self.F_tile.F_rm0, - F_rn0=self.F_tile.F_rn0, - F_rk0=self.F_tile.F_rk0, - F_rm1=self.F_tile.F_rm1, - F_rn1=self.F_tile.F_rn1, - F_rk1=self.F_tile.F_rk1, - F_wm0=self.F_tile.F_wm0, - F_wn0=self.F_tile.F_wn0, - F_wk0=self.F_tile.F_wk0, - F_wm1=self.F_tile.F_wm1, - F_wn1=self.F_tile.F_wn1, - F_wk1=self.F_tile.F_wk1, - F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout], - F_spad=BOOL_MAP[self.F_pipeline.F_spad], - F_skpad=BOOL_MAP[self.F_pipeline.F_skpad], - F_dpad=BOOL_MAP[self.F_pipeline.F_dpad], - F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad], - # F_logits removed - hardcoded to false in template (NOT supported) - F_occupancy=self.F_tile.F_occupancy, - F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag], - F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask], - F_mode=MODE_MAP[self.F_mode], - F_pipeline=PIPELINE_MAP[self.F_pipeline.tag], - F_trload=BOOL_MAP[self.F_pipeline.F_trload], - F_kernel_name=self.name, - ) - - @property - def name(self) -> str: - # TODO: we don't encode idx here - return ( - f"fmha_vsa_fwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" - + self.F_tile.name - + "_" - + self.F_pipeline.name - ) - - @property - def filename(self) -> str: - return self.name + ".cpp" - - def api_trait(self) -> FmhaFwdApiTrait: - return 
FmhaFwdApiTrait( - pipeline_tag=self.F_pipeline.tag, - hdim=str(self.F_hdim), - dtype=self.F_dtype, - mode=self.F_mode, - bm0=self.F_tile.F_bm0, - bn0=self.F_tile.F_bn0, - bk0=self.F_tile.F_bk0, - bn1=self.F_tile.F_bn1, - bk1=self.F_tile.F_bk1, - bk0max=self.F_tile.F_bk0max, - vlayout=self.F_pipeline.F_vlayout, - mask=self.F_pipeline.F_mask, - logits=self.F_pipeline.F_logits, - spad=self.F_pipeline.F_spad, - skpad=self.F_pipeline.F_skpad, - dpad=self.F_pipeline.F_dpad, - dvpad=self.F_pipeline.F_dvpad, - tr_load=self.F_pipeline.F_trload, - constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint, - ) - - -class KernelComponentFactory: - # TODO: design a more practical way to do it - # this is current supported tile size per hdim - @staticmethod - def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: - if dtype == "fp16" or dtype == "bf16": - return { - # (32, 32) : [FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - # (64, 64) : [FmhaFwdTileSize(16, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 32, -1), - # FmhaFwdTileSize(32, 32, 64, 64, 32, 64, 1, 1, 1, 1, 1, 1, 32, 32, 16, 32, 32, 16, -1), - # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - (128, 128): [ - FmhaFwdTileSize( - 64, - 128, - 64, - 128, - 64, - 128, - 4, - 1, - 1, - 4, - 1, - 1, - 16, - 16, - 16, - 16, - 16, - 16, - -1, - ), - ], - # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], - # (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - # (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], - # (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], - } - else: - return None - - # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad - # support this in future - @staticmethod - def get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) -> List[FmhaFwdPipeline]: - # this function will populate a list possible pipelines - # TODO: the order of List matters! the later in this list will be also be checked later - # NOTE: logits soft-cap is NOT supported by VSA sparse attention (enforced by static_assert) - pipelines = [] - if dtype in ["fp16", "bf16"]: - for logits, mask in itertools.product( - ["f"], # logits soft-cap NOT supported, always false - get_mask_map(mask_impl).keys(), - ): - if hdim == 256 and hdim_v == 256: - # vsa fmha only supports dim <= 192 for now. 
- continue - pipelines.append( - FmhaFwdPipeline( - "qr_async_vsa", - "row", - "t", - "f", - "t", - "t", - logits, - mask, - "f", - ) - ) - pipelines.append( - FmhaFwdPipeline( - "qr_async_vsa", - "row", - "t", - "t", - "t", - "t", - logits, - mask, - "f", - ) - ) - else: - assert False - return pipelines - - -class CustomFactory(KernelComponentFactory): - @staticmethod - def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: - result = KernelComponentFactory.get_hdim_tile_size_dict(dtype) - if dtype == "fp16" or dtype == "bf16": - if (128, 128) in result.keys(): - result[(128, 128)].insert( - 0, - FmhaFwdTileSize( - 64, - 128, - 64, - 128, - 64, - 128, - 4, - 1, - 1, - 4, - 1, - 1, - 16, - 16, - 16, - 16, - 16, - 16, - -1, - CppConstraint( - "get_num_blocks(128) < num_cus * min_cu_util_rate" - ), - ), - ) - return result - - -def get_fwd_blobs( - kernel_filter: Optional[str], receipt, optdim_list, mask_impl -) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: - gen = list() - api_pool = FmhaFwdApiPool(mask_impl) - - factory = ( - CustomFactory - if os.environ.get("CK_TILE_FMHA_FWD_CUSTOM_FACTORY", "0") == "1" - else KernelComponentFactory - ) - - # Only generate fp16/bf16 kernels for now. - # NOTE: VSA sparse attention only supports batch mode (group mode NOT supported, enforced by static_assert) - for dtype in ["fp16", "bf16"]: - d = factory.get_hdim_tile_size_dict(dtype) - if d is None: - continue - for ((hdim, hdim_v), tiles), mode in itertools.product(d.items(), ["batch"]): - for tile, pipeline in itertools.product( - tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) - ): - if pipeline.tag != "qr_async_vsa": - continue - k = FmhaFwdKernel( - F_idx=1, - F_hdim=hdim, - F_dtype=dtype, - F_mode=mode, - F_tile=tile, - F_pipeline=pipeline, - mask_impl=mask_impl, - ) - if kernel_filter != "": - if not fnmatch.fnmatch(k.name, kernel_filter): - continue - if optdim_list != [-1]: - if hdim not in optdim_list: - continue - # 2 - Flash attention integration - if receipt in (2, 3): - cond = dtype in ["fp16", "bf16"] - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - # PyTorch integration - elif receipt == 4: - cond = dtype in ["fp16", "bf16"] - cond &= pipeline.F_vlayout == "row" - cond &= mode == "batch" - cond &= pipeline.F_logits == "f" - if not cond: - continue - # Aiter(mha_fwd) integration - elif receipt == 100: - cond = dtype in ["fp16", "bf16"] - cond &= mode == "batch" - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - # Aiter(mha_varlen_fwd) integration - elif receipt == 200: - cond = dtype in ["fp16", "bf16"] - cond &= mode == "group" - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - # aiter::mha_fwd C++ api integration - elif receipt == 600: - cond = dtype in ["fp16", "bf16"] - cond &= pipeline.F_vlayout == "row" - if not cond: - continue - - api_pool.register_traits(k.api_trait()) - gen.append(k) - - return (api_pool, gen) - - -def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: - update_file(autogen_dir / kernel.filename, kernel.template) - - -def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None: - update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api) - - -def write_blobs( - output_dir: Path, kernel_filter: str, receipt, optdim_list, mask_impl -) -> None: - api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) - for kernel in kernels: - write_single_fwd_kernel(kernel, output_dir) - write_fwd_api(api_pool, output_dir) - - -def list_blobs( 
- file_path: Path, kernel_filter: str, receipt, optdim_list, mask_impl -) -> None: - with file_path.open("a") as f: - _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) - for kernel in kernels: - f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n") - f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n") diff --git a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp index 25e3513d2fa..350d1803f66 100644 --- a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp +++ b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp @@ -277,13 +277,13 @@ struct fmha_jenga_fwd_traits float fmha_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); -// sparge jenga -float sparge_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); - template float fmha_jenga_fwd_(const ck_tile::stream_config&, fmha_jenga_fwd_args); -float fmha_jenga_fwd(fmha_jenga_fwd_args, const ck_tile::stream_config&); +template +void fmha_jenga_fwd_oneshot_(const ck_tile::stream_config&, fmha_jenga_fwd_args); + +void fmha_jenga_fwd_oneshot(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); // VSA uses the same traits structure as Jenga; aliases for clarity template float fmha_vsa_fwd_(const ck_tile::stream_config&, fmha_vsa_fwd_args); -float fmha_vsa_fwd(fmha_vsa_fwd_args, const ck_tile::stream_config&); +template +void fmha_vsa_fwd_oneshot_(const ck_tile::stream_config&, fmha_vsa_fwd_args); + +void fmha_vsa_fwd_oneshot(fmha_vsa_fwd_traits, fmha_vsa_fwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp b/example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp deleted file mode 100644 index 88f3e08204e..00000000000 --- a/example/ck_tile/50_sparse_attn/jenga_sparge_attention.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#include "jenga_sparge_attention.h" -#include "fmha_fwd_trek.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/host/host_tensor.hpp" -#include "ck_tile/host/device_memory.hpp" -#include - -template -ck_tile::HostTensor -jenga_sparge_attention(const ck_tile::HostTensor& TQ, - const ck_tile::HostTensor& TK, - const ck_tile::HostTensor& TV, - const ck_tile::HostTensor& Tblock_relation_onehot, - ck_tile::HostTensor& Y, - int batch, - int nhead, - int nhead_k, - int seqlen_q, - int seqlen_k, - int hdim_q, - int hdim_v, - bool i_perm, - bool o_perm, - int max_seqlen_q, - int max_seqlen_k, - int log_level) -{ - static_assert(std::is_same_v || - std::is_same_v, - "Jenga sparse attention supports fp16/bf16 only."); - std::string data_type = "fp16"; - if constexpr(std::is_same_v) - { - data_type = "bf16"; - } - - if(max_seqlen_q == 0) - max_seqlen_q = seqlen_q; - if(max_seqlen_k == 0) - max_seqlen_k = seqlen_k; - bool is_v_rowmajor = true; - float scale_s = 1.0 / ck_tile::sqrt(static_cast(hdim_q)); - std::string msk_str = "0"; - mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); - - const ck_tile::index_t shape_seqlen_q = seqlen_q; - const ck_tile::index_t shape_seqlen_k = seqlen_k; - - ck_tile::stream_config stream_config{nullptr, - false, // time_kernel - log_level, - 0, - 1, - false}; - - ck_tile::DeviceMem q_buf(TQ.get_element_space_size_in_bytes()); - ck_tile::DeviceMem k_buf(TK.get_element_space_size_in_bytes()); - ck_tile::DeviceMem v_buf(TV.get_element_space_size_in_bytes()); - ck_tile::DeviceMem block_relation_buf(Tblock_relation_onehot.get_element_space_size_in_bytes()); - ck_tile::DeviceMem o_buf(Y.get_element_space_size_in_bytes()); - - q_buf.ToDevice(TQ.data()); - k_buf.ToDevice(TK.data()); - v_buf.ToDevice(TV.data()); - block_relation_buf.ToDevice(Tblock_relation_onehot.data()); - - const auto init_args = [&](auto& args) { - assert(nhead % nhead_k == 0); - const ck_tile::index_t stride_q = (i_perm ? hdim_q : nhead * hdim_q); - const ck_tile::index_t stride_k = (i_perm ? hdim_q : nhead_k * hdim_q); - const ck_tile::index_t stride_v = [&]() { - if(is_v_rowmajor) - return i_perm ? hdim_v : nhead_k * hdim_v; - else - return (i_perm ? shape_seqlen_k : nhead_k * shape_seqlen_k); - }(); - const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v); - const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q); - const ck_tile::index_t nhead_stride_k = i_perm ? shape_seqlen_k * hdim_q : hdim_q; - const ck_tile::index_t nhead_stride_v = [&]() { - if(is_v_rowmajor) - return i_perm ? shape_seqlen_k * hdim_v : hdim_v; - else - return i_perm ? hdim_v * shape_seqlen_k : shape_seqlen_k; - }(); - const ck_tile::index_t nhead_stride_o = (o_perm ? 
shape_seqlen_q * hdim_v : hdim_v); - const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q); - const ck_tile::index_t batch_stride_k = nhead_k * shape_seqlen_k * hdim_q; - const ck_tile::index_t batch_stride_v = nhead_k * hdim_v * shape_seqlen_k; - const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v); - - args.q_ptr = q_buf.GetDeviceBuffer(); - args.k_ptr = k_buf.GetDeviceBuffer(); - args.v_ptr = v_buf.GetDeviceBuffer(); - args.block_relation_onehot_ptr = block_relation_buf.GetDeviceBuffer(); - - args.batch = batch; - args.seqlen_q = shape_seqlen_q; - args.hdim_q = hdim_q; - args.hdim_v = hdim_v; - args.nhead_q = nhead; - args.nhead_k = nhead_k; - - args.stride_q = stride_q; - args.stride_k = stride_k; - args.stride_v = stride_v; - args.nhead_stride_q = nhead_stride_q; - args.nhead_stride_k = nhead_stride_k; - args.nhead_stride_v = nhead_stride_v; - args.batch_stride_q = batch_stride_q; - args.batch_stride_k = batch_stride_k; - args.batch_stride_v = batch_stride_v; - - args.o_ptr = o_buf.GetDeviceBuffer(); - - args.seqlen_k = shape_seqlen_k; - args.max_seqlen_q = max_seqlen_q; - - args.scale_s = scale_s; - - args.stride_o = stride_o; - args.nhead_stride_o = nhead_stride_o; - args.batch_stride_o = batch_stride_o; - - args.window_size_left = mask.left; - args.window_size_right = mask.right; - args.mask_type = static_cast(mask.type); - }; - - const auto init_traits = [&](auto& traits) { - traits.hdim_q = hdim_q; - traits.hdim_v = hdim_v; - traits.data_type = data_type; - traits.is_v_rowmajor = is_v_rowmajor; - traits.mask_type = mask.type; - }; - - fmha_jenga_fwd_traits fmha_traits; - init_traits(fmha_traits); - - fmha_jenga_fwd_args args; - init_args(args); - - sparge_jenga_fwd(fmha_traits, args, stream_config); - - o_buf.FromDevice(Y.data(), Y.get_element_space_size_in_bytes()); - - return Y; -} - -template ck_tile::HostTensor -jenga_sparge_attention(const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - ck_tile::HostTensor&, - int, - int, - int, - int, - int, - int, - int, - bool, - bool, - int, - int, - int); - -template ck_tile::HostTensor -jenga_sparge_attention(const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - const ck_tile::HostTensor&, - ck_tile::HostTensor&, - int, - int, - int, - int, - int, - int, - int, - bool, - bool, - int, - int, - int); diff --git a/example/ck_tile/50_sparse_attn/jenga_sparge_attention.h b/example/ck_tile/50_sparse_attn/jenga_sparge_attention.h deleted file mode 100644 index 6259fcc73cf..00000000000 --- a/example/ck_tile/50_sparse_attn/jenga_sparge_attention.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#pragma once -#include -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host/host_tensor.hpp" - -template -ck_tile::HostTensor -jenga_sparge_attention(const ck_tile::HostTensor& TQ, - const ck_tile::HostTensor& TK, - const ck_tile::HostTensor& TV, - const ck_tile::HostTensor& Tblock_relation_onehot, - ck_tile::HostTensor& Y, - int batch, - int nhead, - int nhead_k, - int seqlen_q, - int seqlen_k, - int hdim_q, - int hdim_v, - bool i_perm, - bool o_perm, - int max_seqlen_q, - int max_seqlen_k, - int log_level = 0); diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp index fbd18b9ff24..a2df5bac569 100644 --- a/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp @@ -61,6 +61,57 @@ using bmap_fp16_problem = ck_tile::BlockFmhaPipelineProblem; using bmap_fp16_kernel = ck_tile::SpargeBlockMapKernel; +// ============================================================================ +// bf16: D=128, kM0=64, kN0=128 +// ============================================================================ + +using bmap_bf16_block_tile = ck_tile::sequence<64, 128, 128, 128, 128, 128>; + +using bmap_bf16_shape = + ck_tile::TileFmhaShape, + ck_tile::sequence<16, 16, 16>, + ck_tile::sequence<4, 1, 1>, + ck_tile::sequence<16, 16, 16>, + true>; + +using bmap_bf16_trait = ck_tile::TileFmhaTraits; + +using bmap_bf16_variant = ck_tile::ComposedAttention<0, CK_TILE_FMHA_FWD_FAST_EXP2>; +using bmap_bf16_mask = ck_tile::GenericAttentionMask; + +using bmap_bf16_problem = ck_tile::BlockFmhaPipelineProblem; + +using bmap_bf16_pipeline = ck_tile::SpargeBlockMapPipeline; +using bmap_bf16_kernel = ck_tile::SpargeBlockMapKernel; + // ============================================================================ // Dispatch // ============================================================================ @@ -81,8 +132,96 @@ float sparge_blockmap_fwd(sparge_blockmap_traits traits, s, ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)); } + if(traits.data_type == "bf16" && traits.hdim_q == 128) + { + using k_ = bmap_bf16_kernel; + if(s.log_level_ > 0) + std::cout << ", sparge_blockmap_bf16_d128" << std::flush; + auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)); + } + if(s.log_level_ > 0) std::cerr << "sparge_blockmap_fwd: unsupported config (data_type=" << traits.data_type << ", hdim_q=" << traits.hdim_q << ")" << std::endl; return -1.f; } + +// ============================================================================ +// Oneshot version: launches kernel without timing wrapper +// ============================================================================ + +void sparge_blockmap_fwd_oneshot(sparge_blockmap_traits traits, + sparge_blockmap_args args, + const ck_tile::stream_config& s) +{ + if(traits.data_type == "fp16" && traits.hdim_q == 128) + { + using k_ = bmap_fp16_kernel; + auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)( + ck_tile::stream_config{s.stream_id_}); + return; + } + + if(traits.data_type == "bf16" && traits.hdim_q == 128) + { + using k_ = bmap_bf16_kernel; + 
auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); + const dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)( + ck_tile::stream_config{s.stream_id_}); + return; + } + + std::cerr << "sparge_blockmap_fwd_oneshot: unsupported config (data_type=" << traits.data_type + << ", hdim_q=" << traits.hdim_q << ")" << std::endl; +} + +// ============================================================================ +// Combined functions: blockmap + attention timed together via launch_kernel +// ============================================================================ + +float sparge_jenga_fwd(sparge_blockmap_traits bmap_t, sparge_blockmap_args bmap_a, + fmha_jenga_fwd_traits attn_t, fmha_jenga_fwd_args attn_a, + const ck_tile::stream_config& s) +{ + if(s.log_level_ > 0) + std::cout << ", sparge_blockmap_" << bmap_t.data_type << "_d" << bmap_t.hdim_q + << ", fmha_jenga_fwd_" << attn_t.data_type << "_d" << attn_t.hdim_q + << std::flush; + + return ck_tile::launch_kernel( + s, + [=](const ck_tile::stream_config& s_) { + sparge_blockmap_fwd_oneshot(bmap_t, bmap_a, s_); + }, + [=](const ck_tile::stream_config& s_) { + fmha_jenga_fwd_oneshot(attn_t, attn_a, s_); + }); +} + +float sparge_vsa_fwd_combined(sparge_blockmap_traits bmap_t, sparge_blockmap_args bmap_a, + fmha_vsa_fwd_traits attn_t, fmha_vsa_fwd_args attn_a, + const ck_tile::stream_config& s) +{ + if(s.log_level_ > 0) + std::cout << ", sparge_blockmap_" << bmap_t.data_type << "_d" << bmap_t.hdim_q + << ", fmha_vsa_fwd_" << attn_t.data_type << "_d" << attn_t.hdim_q + << std::flush; + + return ck_tile::launch_kernel( + s, + [=](const ck_tile::stream_config& s_) { + sparge_blockmap_fwd_oneshot(bmap_t, bmap_a, s_); + }, + [=](const ck_tile::stream_config& s_) { + fmha_vsa_fwd_oneshot(attn_t, attn_a, s_); + }); +} diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp index 1e7e33248a2..6eaeb9ea77b 100644 --- a/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp @@ -91,3 +91,16 @@ auto sparge_blockmap_create_kargs_and_grids(sparge_blockmap_args args) float sparge_blockmap_fwd(sparge_blockmap_traits traits, sparge_blockmap_args args, const ck_tile::stream_config& stream_config); + +void sparge_blockmap_fwd_oneshot(sparge_blockmap_traits traits, + sparge_blockmap_args args, + const ck_tile::stream_config& stream_config); + +// Combined functions: blockmap + attention with unified timing +float sparge_jenga_fwd(sparge_blockmap_traits, sparge_blockmap_args, + fmha_jenga_fwd_traits, fmha_jenga_fwd_args, + const ck_tile::stream_config&); + +float sparge_vsa_fwd_combined(sparge_blockmap_traits, sparge_blockmap_args, + fmha_vsa_fwd_traits, fmha_vsa_fwd_args, + const ck_tile::stream_config&); diff --git a/example/ck_tile/50_sparse_attn/test_sparge.cpp b/example/ck_tile/50_sparse_attn/test_sparge.cpp new file mode 100644 index 00000000000..7c30a10b062 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/test_sparge.cpp @@ -0,0 +1,432 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +// Unified test for Sparge pipeline: blockmap generation + sparse attention (Jenga/VSA). 
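// Illustrative call pattern for the combined entry points declared in
// sparge_blockmap_trek.hpp (a minimal sketch only; the full traits/args setup is
// what run_test() below performs). The *_oneshot launchers enqueue their kernel
// directly on the stream, which lets ck_tile::launch_kernel time the block-map
// pass and the attention pass as one measurement:
//
//   sparge_blockmap_traits bmap_t;  sparge_blockmap_args bmap_a;  // filled as in run_test()
//   fmha_vsa_fwd_traits    attn_t;  fmha_vsa_fwd_args    attn_a;  // filled as in run_test()
//   ck_tile::stream_config cfg;     cfg.time_kernel_ = true;
//   float avg_ms = sparge_vsa_fwd_combined(bmap_t, bmap_a, attn_t, attn_a, cfg);
//   // avg_ms covers block-map generation plus VSA sparse attention together.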
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/reference/reference_blocked_attention.hpp" +#include "ck_tile/core/utility/bit_cast.hpp" + +#include "fmha_fwd_trek.hpp" +#include "sparge_blockmap_trek.hpp" +#include "sparge_tool.hpp" + +// ============================================================================ +// Helpers +// ============================================================================ + +template +ck_tile::HostTensor +make_qkv_tensor(ck_tile::index_t batch, ck_tile::index_t nhead, ck_tile::index_t seqlen, ck_tile::index_t hdim, bool i_perm) +{ + if(i_perm) + return ck_tile::HostTensor({batch, nhead, seqlen, hdim}); + return ck_tile::HostTensor({batch, seqlen, nhead, hdim}); +} + +template +ck_tile::HostTensor to_bhsd(const ck_tile::HostTensor& tensor, bool is_bhsd) +{ + auto lens = tensor.get_lengths(); + ck_tile::index_t batch = lens[0]; + ck_tile::index_t seqlen = is_bhsd ? lens[2] : lens[1]; + ck_tile::index_t nhead = is_bhsd ? lens[1] : lens[2]; + ck_tile::index_t hdim = lens[3]; + + ck_tile::HostTensor out({batch, nhead, seqlen, hdim}); + for(ck_tile::index_t b = 0; b < batch; ++b) + for(ck_tile::index_t h = 0; h < nhead; ++h) + for(ck_tile::index_t s = 0; s < seqlen; ++s) + for(ck_tile::index_t d = 0; d < hdim; ++d) + out(b, h, s, d) = is_bhsd ? tensor(b, h, s, d) : tensor(b, s, h, d); + return out; +} + +template +auto get_error_tolerance() +{ + double rtol = 1e-2; + double atol = 4e-2; + if constexpr(std::is_same_v) + { + atol = 2e-1; + rtol = 2e-1; + } + return ck_tile::make_tuple(rtol, atol); +} + +template +float to_float_for_compare(T value) +{ + return static_cast(value); +} + +template <> +float to_float_for_compare(ck_tile::bf16_t value) +{ +#if CK_TILE_USE_CUSTOM_DATA_TYPE + return static_cast(value); +#else + return ck_tile::bf16_to_float_raw(ck_tile::bit_cast(value)); +#endif +} + +// ============================================================================ +// Arg parser +// ============================================================================ +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser + .insert("v", "1", "0:no validation, 1:cpu validation") + .insert("pipeline", "jenga", "attention pipeline: jenga / vsa") + .insert("b", "1", "batch size") + .insert("h", "4", "num of head for q") + .insert("h_k", "-1", "num of head for k/v, -1 means equal to h") + .insert("s", "4096", "seqlen_q") + .insert("s_k", "-1", "seqlen_k, -1 means equal to s") + .insert("d", "128", "head dim for q, k") + .insert("d_v", "-1", "head dim for v, -1 means equal to d") + .insert("topk", "0.3", "topk ratio for blockmap (fraction of K-blocks to keep)") + .insert("cdfthreshd", "-1", "CDF threshold for blockmap (overrides topk if >= 0)") + .insert("simthreshd1", "0.6", "similarity threshold for blockmap") + .insert("prec", "fp16", "data type: fp16/bf16") + .insert("iperm", "1", "permute input, 1: b*h*s*d, 0: b*s*h*d") + .insert("operm", "1", "permute output") + .insert("seed", "42", "random seed") + .insert("warmup", "5", "warmup iterations") + .insert("repeat", "20", "benchmark iterations") + .insert("kname", "0", "print kernel name"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// ============================================================================ +// Main test +// ============================================================================ 
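// Selection-mode note (illustrative numbers, assuming the option defaults above and
// the BLKK = 128 block size used in run_test()): with s = 4096 there are
// (4096 + 128 - 1) / 128 = 32 K-blocks per row, so topk = 0.3 keeps roughly 30% of
// them (about 9-10 blocks) for each Q-block. Passing cdfthreshd >= 0 switches to
// CDF-based selection and disables topk (see the "If cdfthreshd >= 0" check below).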
+template +bool run_test(const ck_tile::ArgParser& arg_parser) +{ + int do_validation = arg_parser.get_int("v"); + std::string pipeline = arg_parser.get_str("pipeline"); + ck_tile::index_t batch = arg_parser.get_int("b"); + ck_tile::index_t nhead = arg_parser.get_int("h"); + ck_tile::index_t nhead_k = arg_parser.get_int("h_k"); + ck_tile::index_t seqlen_q = arg_parser.get_int("s"); + ck_tile::index_t seqlen_k = arg_parser.get_int("s_k"); + ck_tile::index_t hdim_q = arg_parser.get_int("d"); + ck_tile::index_t hdim_v = arg_parser.get_int("d_v"); + float topk = arg_parser.get_float("topk"); + float cdfthreshd = arg_parser.get_float("cdfthreshd"); + float simthreshd1 = arg_parser.get_float("simthreshd1"); + bool i_perm = arg_parser.get_bool("iperm"); + bool o_perm = arg_parser.get_bool("operm"); + uint32_t seed = arg_parser.get_uint32("seed"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + int kname = arg_parser.get_int("kname"); + + if(nhead_k < 0) nhead_k = nhead; + if(seqlen_k < 0) seqlen_k = seqlen_q; + if(hdim_v < 0) hdim_v = hdim_q; + + // If cdfthreshd >= 0, use CDF mode; otherwise use topk mode + if(cdfthreshd >= 0.0f) + topk = -1.0f; + + constexpr ck_tile::index_t BLKQ = 64; + constexpr ck_tile::index_t BLKK = 128; + + if(hdim_q != 128 || hdim_v != 128) + { + std::cout << "\n>>> TEST SKIPPED <<<\n" + << "Kernel instances are generated for hdim=128 only.\n"; + return true; + } + + ck_tile::index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ; + ck_tile::index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK; + + std::string prec_str = std::is_same_v ? "fp16" : "bf16"; + std::cout << "[" << pipeline << "|" << prec_str + << "] b=" << batch << " h=" << nhead << " s=" << seqlen_q + << " d=" << hdim_q << " topk=" << topk + << " sim1=" << simthreshd1 << std::flush; + + // ---- allocate host tensors ---- + auto q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); + auto k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); + auto v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); + auto output_host = o_perm ? 
ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) + : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); + + ck_tile::HostTensor block_map_host({batch, nhead, num_q_blocks, num_k_blocks}); + ck_tile::HostTensor lut_host({batch, nhead, num_q_blocks, num_k_blocks}); + ck_tile::HostTensor valid_block_num_host({batch, nhead, num_q_blocks}); + + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); + ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); + + // ---- device tensors ---- + ck_tile::DeviceMem q_dev(q_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem k_dev(k_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem v_dev(v_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem o_dev(output_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem block_map_dev(block_map_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem lut_dev(lut_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem valid_bn_dev(valid_block_num_host.get_element_space_size_in_bytes()); + + q_dev.ToDevice(q_host.data()); + k_dev.ToDevice(k_host.data()); + v_dev.ToDevice(v_host.data()); + o_dev.SetZero(); + block_map_dev.SetZero(); + lut_dev.SetZero(); + valid_bn_dev.SetZero(); + + // ---- strides (BHSD when i_perm=true) ---- + auto q_strides = q_host.get_strides(); + auto k_strides = k_host.get_strides(); + auto v_strides = v_host.get_strides(); + auto o_strides = output_host.get_strides(); + + float scale_s = 1.0f / std::sqrt(static_cast(hdim_q)); + + // ---- build blockmap args ---- + sparge_blockmap_traits bmap_traits; + bmap_traits.data_type = std::is_same_v ? "fp16" : "bf16"; + bmap_traits.hdim_q = hdim_q; + + sparge_blockmap_args bmap_args; + bmap_args.q_ptr = q_dev.GetDeviceBuffer(); + bmap_args.k_ptr = k_dev.GetDeviceBuffer(); + bmap_args.batch = batch; + bmap_args.seqlen_q = seqlen_q; + bmap_args.seqlen_k = seqlen_k; + bmap_args.hdim_q = hdim_q; + bmap_args.nhead_q = nhead; + bmap_args.nhead_k = nhead_k; + bmap_args.stride_q = q_strides[i_perm ? 2 : 1]; + bmap_args.stride_k = k_strides[i_perm ? 2 : 1]; + bmap_args.nhead_stride_q = q_strides[i_perm ? 1 : 2]; + bmap_args.nhead_stride_k = k_strides[i_perm ? 1 : 2]; + bmap_args.batch_stride_q = q_strides[0]; + bmap_args.batch_stride_k = k_strides[0]; + bmap_args.simthreshd1 = simthreshd1; + bmap_args.cdfthreshd = (topk < 0.0f) ? cdfthreshd : -1.0f; + bmap_args.topk = topk; + bmap_args.scale = scale_s; + bmap_args.block_map_ptr = block_map_dev.GetDeviceBuffer(); + bmap_args.lut_ptr = (pipeline == "vsa") ? lut_dev.GetDeviceBuffer() : nullptr; + bmap_args.valid_block_num_ptr = (pipeline == "vsa") ? valid_bn_dev.GetDeviceBuffer() : nullptr; + + // ---- build attention args ---- + ck_tile::stream_config stream_cfg; + stream_cfg.stream_id_ = nullptr; + stream_cfg.time_kernel_ = true; + stream_cfg.log_level_ = kname; + stream_cfg.cold_niters_ = warmup; + stream_cfg.nrepeat_ = repeat; + + float avg_ms = -1.0f; + + if(pipeline == "jenga") + { + fmha_jenga_fwd_traits attn_traits; + attn_traits.hdim_q = hdim_q; + attn_traits.hdim_v = hdim_v; + attn_traits.data_type = std::is_same_v ? 
"fp16" : "bf16"; + attn_traits.is_v_rowmajor = true; + attn_traits.mask_type = mask_enum::no_mask; + + fmha_jenga_fwd_args attn_args; + attn_args.q_ptr = q_dev.GetDeviceBuffer(); + attn_args.k_ptr = k_dev.GetDeviceBuffer(); + attn_args.v_ptr = v_dev.GetDeviceBuffer(); + attn_args.block_relation_onehot_ptr = block_map_dev.GetDeviceBuffer(); + attn_args.o_ptr = o_dev.GetDeviceBuffer(); + attn_args.seqlen_q = seqlen_q; + attn_args.seqlen_k = seqlen_k; + attn_args.batch = batch; + attn_args.max_seqlen_q = seqlen_q; + attn_args.hdim_q = hdim_q; + attn_args.hdim_v = hdim_v; + attn_args.nhead_q = nhead; + attn_args.nhead_k = nhead_k; + attn_args.scale_s = scale_s; + attn_args.stride_q = q_strides[i_perm ? 2 : 1]; + attn_args.stride_k = k_strides[i_perm ? 2 : 1]; + attn_args.stride_v = v_strides[i_perm ? 2 : 1]; + attn_args.stride_o = o_strides[o_perm ? 2 : 1]; + attn_args.nhead_stride_q = q_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_k = k_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_v = v_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_o = o_strides[o_perm ? 1 : 2]; + attn_args.batch_stride_q = q_strides[0]; + attn_args.batch_stride_k = k_strides[0]; + attn_args.batch_stride_v = v_strides[0]; + attn_args.batch_stride_o = o_strides[0]; + attn_args.window_size_left = -1; + attn_args.window_size_right = -1; + attn_args.mask_type = 0; + + avg_ms = sparge_jenga_fwd(bmap_traits, bmap_args, attn_traits, attn_args, stream_cfg); + } + else if(pipeline == "vsa") + { + fmha_vsa_fwd_traits attn_traits; + attn_traits.hdim_q = hdim_q; + attn_traits.hdim_v = hdim_v; + attn_traits.data_type = std::is_same_v ? "fp16" : "bf16"; + attn_traits.is_v_rowmajor = true; + attn_traits.mask_type = mask_enum::no_mask; + + fmha_vsa_fwd_args attn_args; + attn_args.q_ptr = q_dev.GetDeviceBuffer(); + attn_args.k_ptr = k_dev.GetDeviceBuffer(); + attn_args.v_ptr = v_dev.GetDeviceBuffer(); + attn_args.lut_ptr = lut_dev.GetDeviceBuffer(); + attn_args.valid_block_num_ptr = valid_bn_dev.GetDeviceBuffer(); + attn_args.o_ptr = o_dev.GetDeviceBuffer(); + attn_args.seqlen_q = seqlen_q; + attn_args.seqlen_k = seqlen_k; + attn_args.batch = batch; + attn_args.max_seqlen_q = seqlen_q; + attn_args.hdim_q = hdim_q; + attn_args.hdim_v = hdim_v; + attn_args.nhead_q = nhead; + attn_args.nhead_k = nhead_k; + attn_args.scale_s = scale_s; + attn_args.stride_q = q_strides[i_perm ? 2 : 1]; + attn_args.stride_k = k_strides[i_perm ? 2 : 1]; + attn_args.stride_v = v_strides[i_perm ? 2 : 1]; + attn_args.stride_o = o_strides[o_perm ? 2 : 1]; + attn_args.nhead_stride_q = q_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_k = k_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_v = v_strides[i_perm ? 1 : 2]; + attn_args.nhead_stride_o = o_strides[o_perm ? 1 : 2]; + attn_args.batch_stride_q = q_strides[0]; + attn_args.batch_stride_k = k_strides[0]; + attn_args.batch_stride_v = v_strides[0]; + attn_args.batch_stride_o = o_strides[0]; + attn_args.window_size_left = -1; + attn_args.window_size_right = -1; + attn_args.mask_type = 0; + + avg_ms = sparge_vsa_fwd_combined(bmap_traits, bmap_args, attn_traits, attn_args, stream_cfg); + } + else + { + std::cerr << "Unknown pipeline: " << pipeline << " (use jenga or vsa)\n"; + return false; + } + + // ---- TFLOPS calculation (dense FMHA formula, so sparsity gains show as higher TFLOPS) ---- + std::size_t flop = static_cast(batch) * nhead * + (static_cast(2) * seqlen_q * seqlen_k * hdim_q + + static_cast(2) * seqlen_q * seqlen_k * hdim_v); + float tflops = (avg_ms > 0.f) ? 
static_cast(flop) / 1.E9f / avg_ms : 0.f; + + if(avg_ms > 0.f) + { + std::cout << std::fixed << ", " << std::setprecision(3) << avg_ms << " ms, " + << std::setprecision(2) << tflops << " TFlops" << std::flush; + } + + // ---- copy results back ---- + o_dev.FromDevice(output_host.data()); + block_map_dev.FromDevice(block_map_host.data()); + + // ---- count active blocks ---- + ck_tile::index_t total_blocks = batch * nhead * num_q_blocks * num_k_blocks; + ck_tile::index_t active_blocks = 0; + for(size_t i = 0; i < block_map_host.mData.size(); ++i) + if(block_map_host.mData[i]) + active_blocks++; + float actual_sparsity = 1.0f - static_cast(active_blocks) / static_cast(total_blocks); + std::cout << ", sparsity=" << std::setprecision(2) << actual_sparsity + << "(" << active_blocks << "/" << total_blocks << ")" << std::flush; + + // ---- validation ---- + bool pass = true; + if(do_validation) + { + auto q_ref = to_bhsd(q_host, i_perm); + auto k_ref = to_bhsd(k_host, i_perm); + auto v_ref = to_bhsd(v_host, i_perm); + + ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); + ck_tile::reference_blocked_attention( + q_ref, k_ref, v_ref, block_map_host, output_ref, BLKQ, BLKK, scale_s); + + auto [rtol, atol] = get_error_tolerance(); + + float max_diff = 0.0f; + size_t num_errors = 0; + + auto output_host_bhsd = to_bhsd(output_host, o_perm); + for(size_t i = 0; i < output_host_bhsd.mData.size(); ++i) + { + float gpu_val = to_float_for_compare(output_host_bhsd.mData[i]); + float ref_val = to_float_for_compare(output_ref.mData[i]); + float diff = std::abs(gpu_val - ref_val); + float rel_diff = (std::abs(ref_val) > 1e-6f) ? diff / std::abs(ref_val) : diff; + + max_diff = std::max(max_diff, diff); + + if(diff > atol && rel_diff > rtol) + num_errors++; + } + + pass = (num_errors == 0); + std::cout << ", " << (pass ? "PASS" : "FAIL") + << "(err=" << num_errors << "/" << output_host_bhsd.mData.size() + << " maxdiff=" << max_diff << ")"; + } + + std::cout << std::endl; + return pass; +} + +// ============================================================================ +// Main +// ============================================================================ +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + { + std::cerr << "Failed to parse arguments\n"; + return -1; + } + + std::string prec = arg_parser.get_str("prec"); + + bool test_result = false; + if(prec == "fp16") + { + test_result = run_test(arg_parser); + } + else if(prec == "bf16") + { + test_result = run_test(arg_parser); + } + else + { + std::cerr << "Unsupported precision: " << prec << "\n"; + return -1; + } + + return test_result ? 0 : -1; +} diff --git a/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp deleted file mode 100644 index 590e51db144..00000000000 --- a/example/ck_tile/50_sparse_attn/test_sparge_jenga_sparse_attn.cpp +++ /dev/null @@ -1,422 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -// Demo: Sparge block-map -> Jenga sparse attention - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ck_tile/host.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/host/reference/reference_blocked_attention.hpp" -#include "ck_tile/core/utility/bit_cast.hpp" - -#include "jenga_sparge_attention.h" -#include "sparge_tool.hpp" - -// ============================================================================ -// Helper Functions -// ============================================================================ - -template -ck_tile::HostTensor make_qkv_tensor(ck_tile::index_t batch, - ck_tile::index_t nhead, - ck_tile::index_t seqlen, - ck_tile::index_t hdim, - bool i_perm) -{ - if(i_perm) - { - return ck_tile::HostTensor({batch, nhead, seqlen, hdim}); - } - return ck_tile::HostTensor({batch, seqlen, nhead, hdim}); -} - -template -ck_tile::HostTensor to_bhsd(const ck_tile::HostTensor& tensor, bool is_bhsd) -{ - auto lens = tensor.get_lengths(); - ck_tile::index_t batch = lens[0]; - ck_tile::index_t seqlen = is_bhsd ? lens[2] : lens[1]; - ck_tile::index_t nhead = is_bhsd ? lens[1] : lens[2]; - ck_tile::index_t hdim = lens[3]; - - ck_tile::HostTensor out({batch, nhead, seqlen, hdim}); - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t s = 0; s < seqlen; ++s) - { - for(ck_tile::index_t d = 0; d < hdim; ++d) - { - out(b, h, s, d) = is_bhsd ? tensor(b, h, s, d) : tensor(b, s, h, d); - } - } - } - } - return out; -} - -template -auto get_error_tolerance() -{ - double rtol = 1e-2; - double atol = 4e-2; - if constexpr(std::is_same_v) - { - atol = 2e-1; - rtol = 2e-1; - } - return ck_tile::make_tuple(rtol, atol); -} - -template -float to_float_for_compare(T value) -{ - return static_cast(value); -} - -template <> -float to_float_for_compare(ck_tile::bf16_t value) -{ -#if CK_TILE_USE_CUSTOM_DATA_TYPE - return static_cast(value); -#else - return ck_tile::bf16_to_float_raw(ck_tile::bit_cast(value)); -#endif -} - -// ============================================================================ -// Command line argument parser -// ============================================================================ - -auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("v", "1", "0:no validation, 1:cpu validation") - .insert("b", "1", "batch size") - .insert("h", "4", "num of head for q") - .insert("h_k", "-1", "num of head for k/v, -1 means equal to h") - .insert("s", "4096", "seqlen_q") - .insert("s_k", "-1", "seqlen_k, -1 means equal to s") - .insert("d", "128", "head dim for q, k") - .insert("d_v", "-1", "head dim for v, -1 means equal to d") - .insert("prec", "fp16", "data type: fp16/bf16") - .insert("iperm", "1", "permute input, 1: b*h*s*d, 0: b*s*h*d") - .insert("operm", "1", "permute output") - .insert("seed", "42", "random seed") - .insert("warmup", "5", "warmup iterations") - .insert("repeat", "20", "benchmark iterations") - .insert("kname", "0", "print kernel name") - // Sparge-specific - .insert("blkq", "64", "Sparge BLKQ") - .insert("blkk", "128", "Sparge BLKK") - .insert("simthreshd1", "0.6", "Sparge sim threshold") - .insert("cdfthreshd", "0.98", "Sparge CDF threshold (used when topk < 0)") - .insert("topk", "-1.0", "Sparge topk ratio in (0,1]; if > 0, overrides cdfthreshd"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - -// 
============================================================================ -// Main Test Function -// ============================================================================ - -template -bool run_test(const ck_tile::ArgParser& arg_parser) -{ - int do_validation = arg_parser.get_int("v"); - ck_tile::index_t batch = arg_parser.get_int("b"); - ck_tile::index_t nhead = arg_parser.get_int("h"); - ck_tile::index_t nhead_k = arg_parser.get_int("h_k"); - ck_tile::index_t seqlen_q = arg_parser.get_int("s"); - ck_tile::index_t seqlen_k = arg_parser.get_int("s_k"); - ck_tile::index_t hdim_q = arg_parser.get_int("d"); - ck_tile::index_t hdim_v = arg_parser.get_int("d_v"); - bool i_perm = arg_parser.get_bool("iperm"); - bool o_perm = arg_parser.get_bool("operm"); - uint32_t seed = arg_parser.get_uint32("seed"); - int warmup = arg_parser.get_int("warmup"); - int repeat = arg_parser.get_int("repeat"); - int kname = arg_parser.get_int("kname"); - - // Sparge params - ck_tile::index_t blkq = arg_parser.get_int("blkq"); - ck_tile::index_t blkk = arg_parser.get_int("blkk"); - float simthreshd1 = arg_parser.get_float("simthreshd1"); - float cdfthreshd = arg_parser.get_float("cdfthreshd"); - float topk = arg_parser.get_float("topk"); - - if(nhead_k < 0) - nhead_k = nhead; - if(seqlen_k < 0) - seqlen_k = seqlen_q; - if(hdim_v < 0) - hdim_v = hdim_q; - - if(blkq != 64 || blkk != 128 || hdim_q != 128 || hdim_v != 128) - { - std::cout << "\n>>> TEST SKIPPED <<<" << std::endl; - std::cout << "Sparge Jenga kernel instances are generated for BLKQ=64, BLKK=128, " - "hdim_q=128, hdim_v=128 only." - << std::endl; - std::cout << "TEST SKIPPED" << std::endl; - return true; - } - - ck_tile::index_t BLKQ = blkq; - ck_tile::index_t BLKK = blkk; - - ck_tile::index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ; - ck_tile::index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK; - - std::cout << "============================================================" << std::endl; - std::cout << "[Sparge -> Jenga Sparse Attention Demo]" << std::endl; - std::cout << "============================================================" << std::endl; - std::cout << " Batch: " << batch << ", nhead_q: " << nhead << ", nhead_k: " << nhead_k - << std::endl; - std::cout << " seqlen_q: " << seqlen_q << ", seqlen_k: " << seqlen_k << std::endl; - std::cout << " hdim_q: " << hdim_q << ", hdim_v: " << hdim_v << std::endl; - std::cout << " BLKQ=" << BLKQ << ", BLKK=" << BLKK << std::endl; - std::cout << " num_q_blocks: " << num_q_blocks << ", num_k_blocks: " << num_k_blocks - << std::endl; - std::cout << " Sparge(simthreshd1=" << simthreshd1 << ", cdfthreshd=" << cdfthreshd - << ", topk=" << topk << ")" << std::endl; - std::cout << " i_perm: " << i_perm << ", o_perm: " << o_perm << std::endl; - - // Create host tensors - ck_tile::HostTensor q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); - ck_tile::HostTensor k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); - ck_tile::HostTensor v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); - ck_tile::HostTensor output_host = - o_perm ? ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) - : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); - ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); - - std::cout << "\nInitializing tensors..." 
<< std::endl; - ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); - ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); - ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); - - // Build block map using Sparge tool - std::cout << "Building Sparge block map..." << std::endl; - sparge::SpargeParams p; - p.BLKQ = static_cast(BLKQ); - p.BLKK = static_cast(BLKK); - p.simthreshd1 = simthreshd1; - p.cdfthreshd = cdfthreshd; - p.topk = topk; - p.i_perm = i_perm; - - ck_tile::HostTensor block_relation_onehot = - sparge::build_block_map_meansim(q_host, k_host, p); - - // Print actual sparsity - std::size_t total_blocks = 0; - std::size_t active_blocks = 0; - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) - { - for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) - { - total_blocks++; - if(block_relation_onehot(b, h, qb, kb) != 0) - active_blocks++; - } - } - } - } - float actual_sparsity = - 1.0f - static_cast(active_blocks) / static_cast(total_blocks); - std::cout << " Actual sparsity: " << actual_sparsity << " (" << active_blocks << "/" - << total_blocks << " blocks active)" << std::endl; - - std::cout << "\n--- Running Jenga sparse attention kernel ---" << std::endl; - - try - { - if(kname) - { - jenga_sparge_attention(q_host, - k_host, - v_host, - block_relation_onehot, - output_host, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - i_perm, - o_perm, - seqlen_q, - seqlen_k, - 1); - } - - for(int i = 0; i < warmup; ++i) - { - jenga_sparge_attention(q_host, - k_host, - v_host, - block_relation_onehot, - output_host, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - i_perm, - o_perm, - seqlen_q, - seqlen_k, - 0); - } - - [[maybe_unused]] auto sync_status1 = hipDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - - for(int i = 0; i < repeat; ++i) - { - jenga_sparge_attention(q_host, - k_host, - v_host, - block_relation_onehot, - output_host, - batch, - nhead, - nhead_k, - seqlen_q, - seqlen_k, - hdim_q, - hdim_v, - i_perm, - o_perm, - seqlen_q, - seqlen_k, - 0); - } - - [[maybe_unused]] auto sync_status2 = hipDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - double avg_time_ms = - std::chrono::duration(end - start).count() / repeat; - - std::cout << "\n>>>> Jenga sparse attention average time: " << avg_time_ms << " ms <<<<" - << std::endl; - } - catch(const std::exception& e) - { - std::cerr << "Error during kernel execution: " << e.what() << std::endl; - return false; - } - - bool pass = true; - if(do_validation) - { - std::cout << "\n--- Performing CPU validation ---" << std::endl; - float scale = 1.0f / std::sqrt(static_cast(hdim_q)); - - std::cout << "Computing reference output..." 
<< std::endl; - auto q_ref = to_bhsd(q_host, i_perm); - auto k_ref = to_bhsd(k_host, i_perm); - auto v_ref = to_bhsd(v_host, i_perm); - - ck_tile::reference_blocked_attention( - q_ref, k_ref, v_ref, block_relation_onehot, output_ref, BLKQ, BLKK, scale); - - auto [rtol, atol] = get_error_tolerance(); - - float max_diff = 0.0f; - float max_rel_diff = 0.0f; - std::size_t num_errors = 0; - - auto output_host_bhsd = to_bhsd(output_host, o_perm); - for(std::size_t i = 0; i < output_host_bhsd.mData.size(); ++i) - { - float gpu_val = to_float_for_compare(output_host_bhsd.mData[i]); - float ref_val = to_float_for_compare(output_ref.mData[i]); - float diff = std::abs(gpu_val - ref_val); - float rel_diff = (std::abs(ref_val) > 1e-6f) ? diff / std::abs(ref_val) : diff; - - max_diff = std::max(max_diff, diff); - max_rel_diff = std::max(max_rel_diff, rel_diff); - - if(diff > atol && rel_diff > rtol) - { - num_errors++; - if(num_errors <= 5) - { - std::cout << " Mismatch at index " << i << ": GPU=" << gpu_val - << ", Ref=" << ref_val << ", Diff=" << diff << std::endl; - } - } - } - - std::cout << "\nValidation results:" << std::endl; - std::cout << " Max absolute difference: " << max_diff << std::endl; - std::cout << " Max relative difference: " << max_rel_diff << std::endl; - std::cout << " Number of mismatches: " << num_errors << " / " - << output_host_bhsd.mData.size() << std::endl; - - if(num_errors == 0) - { - std::cout << "\n>>> VALIDATION PASSED <<<" << std::endl; - } - else - { - std::cout << "\n>>> VALIDATION FAILED <<<" << std::endl; - pass = false; - } - } - - std::cout << "\n" << (pass ? "TEST PASSED" : "TEST FAILED") << std::endl; - return pass; -} - -// ============================================================================ -// Main -// ============================================================================ - -int main(int argc, char* argv[]) -{ - auto [result, arg_parser] = create_args(argc, argv); - if(!result) - { - std::cerr << "Failed to parse arguments" << std::endl; - return -1; - } - - std::string prec = arg_parser.get_str("prec"); - - bool test_result = false; - if(prec == "fp16") - { - test_result = run_test(arg_parser); - } - else if(prec == "bf16") - { - test_result = run_test(arg_parser); - } - else - { - std::cerr << "Unsupported precision: " << prec << std::endl; - return -1; - } - - return test_result ? 0 : -1; -} diff --git a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp b/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp deleted file mode 100644 index 572b708f9ef..00000000000 --- a/example/ck_tile/50_sparse_attn/test_sparge_vsa_sparse_attn.cpp +++ /dev/null @@ -1,597 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -// Demo: Sparge block-map -> (delta LUT) -> VSA sparse attention (all-in-device) - -#include -#include -#include -#include "ck_tile/host.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/host/reference/reference_blocked_attention.hpp" -#include "ck_tile/core/utility/bit_cast.hpp" - -#include "sparge_blockmap_trek.hpp" -#include "fmha_fwd_trek.hpp" -#include "sparge_tool.hpp" - -// ============================================================================ -// Helper Functions -// ============================================================================ - -template -ck_tile::HostTensor make_qkv_tensor(ck_tile::index_t batch, - ck_tile::index_t nhead, - ck_tile::index_t seqlen, - ck_tile::index_t hdim, - bool i_perm) -{ - if(i_perm) - { - return ck_tile::HostTensor({batch, nhead, seqlen, hdim}); - } - return ck_tile::HostTensor({batch, seqlen, nhead, hdim}); -} - -template -ck_tile::HostTensor to_bhsd(const ck_tile::HostTensor& tensor, bool is_bhsd) -{ - auto lens = tensor.get_lengths(); - ck_tile::index_t batch = lens[0]; - ck_tile::index_t seqlen = is_bhsd ? lens[2] : lens[1]; - ck_tile::index_t nhead = is_bhsd ? lens[1] : lens[2]; - ck_tile::index_t hdim = lens[3]; - - ck_tile::HostTensor out({batch, nhead, seqlen, hdim}); - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t s = 0; s < seqlen; ++s) - { - for(ck_tile::index_t d = 0; d < hdim; ++d) - { - out(b, h, s, d) = is_bhsd ? tensor(b, h, s, d) : tensor(b, s, h, d); - } - } - } - } - return out; -} - -template -auto get_error_tolerance() -{ - double rtol = 1e-2; - double atol = 4e-2; - if constexpr(std::is_same_v) - { - atol = 2e-1; - rtol = 2e-1; - } - return ck_tile::make_tuple(rtol, atol); -} - -template -float to_float_for_compare(T value) -{ - return static_cast(value); -} - -template <> -float to_float_for_compare(ck_tile::bf16_t value) -{ -#if CK_TILE_USE_CUSTOM_DATA_TYPE - return static_cast(value); -#else - return ck_tile::bf16_to_float_raw(ck_tile::bit_cast(value)); -#endif -} - -// ============================================================================ -// Command line argument parser -// ============================================================================ - -auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("v", "1", "0:no validation, 1:cpu validation") - .insert("b", "1", "batch size") - .insert("h", "4", "num of head for q") - .insert("h_k", "-1", "num of head for k/v, -1 means equal to h") - .insert("s", "4096", "seqlen_q") - .insert("s_k", "-1", "seqlen_k, -1 means equal to s") - .insert("d", "128", "head dim for q, k") - .insert("d_v", "-1", "head dim for v, -1 means equal to d") - .insert("prec", "fp16", "data type: fp16/bf16") - .insert("iperm", "1", "permute input, 1: b*h*s*d, 0: b*s*h*d") - .insert("operm", "1", "permute output") - .insert("seed", "42", "random seed") - .insert("warmup", "5", "warmup iterations") - .insert("repeat", "20", "benchmark iterations") - .insert("kname", "0", "print kernel name") - // Sparge-specific - .insert("blkq", "64", "Sparge BLKQ") - .insert("blkk", "128", "Sparge BLKK") - .insert("simthreshd1", "0.6", "Sparge sim threshold") - .insert("cdfthreshd", "0.98", "Sparge CDF threshold (used when topk < 0)") - .insert("topk", "-1.0", "Sparge topk ratio in (0,1]; if > 0, overrides cdfthreshd"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - -// 
============================================================================ -// Main Test Function -// ============================================================================ - -template -bool run_test(const ck_tile::ArgParser& arg_parser) -{ - int do_validation = arg_parser.get_int("v"); - ck_tile::index_t batch = arg_parser.get_int("b"); - ck_tile::index_t nhead = arg_parser.get_int("h"); - ck_tile::index_t nhead_k = arg_parser.get_int("h_k"); - ck_tile::index_t seqlen_q = arg_parser.get_int("s"); - ck_tile::index_t seqlen_k = arg_parser.get_int("s_k"); - ck_tile::index_t hdim_q = arg_parser.get_int("d"); - ck_tile::index_t hdim_v = arg_parser.get_int("d_v"); - bool i_perm = arg_parser.get_bool("iperm"); - bool o_perm = arg_parser.get_bool("operm"); - uint32_t seed = arg_parser.get_uint32("seed"); - int warmup = arg_parser.get_int("warmup"); - int repeat = arg_parser.get_int("repeat"); - int kname = arg_parser.get_int("kname"); - - // Sparge params - ck_tile::index_t blkq = arg_parser.get_int("blkq"); - ck_tile::index_t blkk = arg_parser.get_int("blkk"); - float simthreshd1 = arg_parser.get_float("simthreshd1"); - float cdfthreshd = arg_parser.get_float("cdfthreshd"); - float topk = arg_parser.get_float("topk"); - - if(nhead_k < 0) - nhead_k = nhead; - if(seqlen_k < 0) - seqlen_k = seqlen_q; - if(hdim_v < 0) - hdim_v = hdim_q; - - if(blkq != 64 || blkk != 128 || hdim_q != 128 || hdim_v != 128) - { - std::cout << "\n>>> TEST SKIPPED <<<" << std::endl; - std::cout << "Sparge VSA kernel instances are generated for BLKQ=64, BLKK=128, " - "hdim_q=128, hdim_v=128 only." - << std::endl; - std::cout << "TEST SKIPPED" << std::endl; - return true; - } - - ck_tile::index_t BLKQ = blkq; - ck_tile::index_t BLKK = blkk; - - ck_tile::index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ; - ck_tile::index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK; - - std::cout << "============================================================" << std::endl; - std::cout << "[Sparge -> VSA Sparse Attention Demo]" << std::endl; - std::cout << "============================================================" << std::endl; - std::cout << " Batch: " << batch << ", nhead_q: " << nhead << ", nhead_k: " << nhead_k - << std::endl; - std::cout << " seqlen_q: " << seqlen_q << ", seqlen_k: " << seqlen_k << std::endl; - std::cout << " hdim_q: " << hdim_q << ", hdim_v: " << hdim_v << std::endl; - std::cout << " BLKQ=" << BLKQ << ", BLKK=" << BLKK << std::endl; - std::cout << " num_q_blocks: " << num_q_blocks << ", num_k_blocks: " << num_k_blocks - << std::endl; - std::cout << " Sparge(simthreshd1=" << simthreshd1 << ", cdfthreshd=" << cdfthreshd - << ", topk=" << topk << ")" << std::endl; - std::cout << " i_perm: " << i_perm << ", o_perm: " << o_perm << std::endl; - - // Create host tensors and fill with random data - ck_tile::HostTensor q_host = make_qkv_tensor(batch, nhead, seqlen_q, hdim_q, i_perm); - ck_tile::HostTensor k_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_q, i_perm); - ck_tile::HostTensor v_host = make_qkv_tensor(batch, nhead_k, seqlen_k, hdim_v, i_perm); - ck_tile::HostTensor output_host = - o_perm ? ck_tile::HostTensor({batch, nhead, seqlen_q, hdim_v}) - : ck_tile::HostTensor({batch, seqlen_q, nhead, hdim_v}); - - std::cout << "\nInitializing tensors..." 
<< std::endl; - ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed}(q_host); - ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 1}(k_host); - ck_tile::FillUniformDistribution{-0.5f, 0.5f, seed + 2}(v_host); - - // ================================================================== - // Allocate device memory once, HtoD once - // ================================================================== - ck_tile::DeviceMem q_buf(q_host.get_element_space_size_in_bytes()); - ck_tile::DeviceMem k_buf(k_host.get_element_space_size_in_bytes()); - ck_tile::DeviceMem v_buf(v_host.get_element_space_size_in_bytes()); - ck_tile::DeviceMem o_buf(output_host.get_element_space_size_in_bytes()); - - q_buf.ToDevice(q_host.data()); - k_buf.ToDevice(k_host.data()); - v_buf.ToDevice(v_host.data()); - - const std::size_t bmap_bytes = - static_cast(batch) * nhead * num_q_blocks * num_k_blocks * sizeof(uint8_t); - const std::size_t lut_bytes = - static_cast(batch) * nhead * num_q_blocks * num_k_blocks * sizeof(int32_t); - const std::size_t valid_bytes = - static_cast(batch) * nhead * num_q_blocks * sizeof(int32_t); - - ck_tile::DeviceMem bmap_buf(bmap_bytes); - ck_tile::DeviceMem lut_buf(lut_bytes); - ck_tile::DeviceMem valid_buf(valid_bytes); - bmap_buf.SetZero(); - lut_buf.SetZero(); - valid_buf.SetZero(); - - // ================================================================== - // Common stride calculations - // ================================================================== - assert(nhead % nhead_k == 0); - const float scale_s = 1.0f / std::sqrt(static_cast(hdim_q)); - - const ck_tile::index_t stride_q = i_perm ? hdim_q : nhead * hdim_q; - const ck_tile::index_t stride_k = i_perm ? hdim_q : nhead_k * hdim_q; - const ck_tile::index_t stride_v = i_perm ? hdim_v : nhead_k * hdim_v; - const ck_tile::index_t stride_o = o_perm ? hdim_v : nhead * hdim_v; - const ck_tile::index_t nhead_stride_q = i_perm ? seqlen_q * hdim_q : hdim_q; - const ck_tile::index_t nhead_stride_k = i_perm ? seqlen_k * hdim_q : hdim_q; - const ck_tile::index_t nhead_stride_v = i_perm ? seqlen_k * hdim_v : hdim_v; - const ck_tile::index_t nhead_stride_o = o_perm ? seqlen_q * hdim_v : hdim_v; - const ck_tile::index_t batch_stride_q = nhead * seqlen_q * hdim_q; - const ck_tile::index_t batch_stride_k = nhead_k * seqlen_k * hdim_q; - const ck_tile::index_t batch_stride_v = nhead_k * hdim_v * seqlen_k; - const ck_tile::index_t batch_stride_o = nhead * seqlen_q * hdim_v; - - std::string data_type = "fp16"; - if constexpr(std::is_same_v) - data_type = "bf16"; - - std::string msk_str = "0"; - mask_info mask = mask_info::decode(msk_str, seqlen_q, seqlen_k); - - // ================================================================== - // GPU: Build block map + VSA LUT (always run, device-only) - // ================================================================== - std::cout << "Building Sparge block map + VSA LUT (GPU)..." 
<< std::endl; - { - sparge_blockmap_args args; - args.q_ptr = q_buf.GetDeviceBuffer(); - args.k_ptr = k_buf.GetDeviceBuffer(); - args.batch = batch; - args.seqlen_q = seqlen_q; - args.seqlen_k = seqlen_k; - args.hdim_q = hdim_q; - args.nhead_q = nhead; - args.nhead_k = nhead_k; - args.stride_q = stride_q; - args.stride_k = stride_k; - args.nhead_stride_q = nhead_stride_q; - args.nhead_stride_k = nhead_stride_k; - args.batch_stride_q = batch_stride_q; - args.batch_stride_k = batch_stride_k; - args.simthreshd1 = simthreshd1; - args.cdfthreshd = cdfthreshd; - args.topk = topk; - args.scale = scale_s; - args.block_map_ptr = bmap_buf.GetDeviceBuffer(); - args.lut_ptr = lut_buf.GetDeviceBuffer(); - args.valid_block_num_ptr = valid_buf.GetDeviceBuffer(); - - sparge_blockmap_traits traits; - traits.data_type = data_type; - traits.hdim_q = hdim_q; - - sparge_blockmap_fwd(traits, args, ck_tile::stream_config{}); - } - - // ================================================================== - // VSA sparse attention kernel (always run, LUT stays on device) - // ================================================================== - std::cout << "\n--- Running VSA sparse attention kernel ---" << std::endl; - - fmha_vsa_fwd_args fmha_args; - fmha_args.q_ptr = q_buf.GetDeviceBuffer(); - fmha_args.k_ptr = k_buf.GetDeviceBuffer(); - fmha_args.v_ptr = v_buf.GetDeviceBuffer(); - fmha_args.lut_ptr = lut_buf.GetDeviceBuffer(); - fmha_args.valid_block_num_ptr = valid_buf.GetDeviceBuffer(); - fmha_args.o_ptr = o_buf.GetDeviceBuffer(); - fmha_args.batch = batch; - fmha_args.seqlen_q = seqlen_q; - fmha_args.seqlen_k = seqlen_k; - fmha_args.max_seqlen_q = seqlen_q; - fmha_args.hdim_q = hdim_q; - fmha_args.hdim_v = hdim_v; - fmha_args.nhead_q = nhead; - fmha_args.nhead_k = nhead_k; - fmha_args.scale_s = scale_s; - fmha_args.stride_q = stride_q; - fmha_args.stride_k = stride_k; - fmha_args.stride_v = stride_v; - fmha_args.stride_o = stride_o; - fmha_args.nhead_stride_q = nhead_stride_q; - fmha_args.nhead_stride_k = nhead_stride_k; - fmha_args.nhead_stride_v = nhead_stride_v; - fmha_args.nhead_stride_o = nhead_stride_o; - fmha_args.batch_stride_q = batch_stride_q; - fmha_args.batch_stride_k = batch_stride_k; - fmha_args.batch_stride_v = batch_stride_v; - fmha_args.batch_stride_o = batch_stride_o; - fmha_args.window_size_left = mask.left; - fmha_args.window_size_right = mask.right; - fmha_args.mask_type = static_cast(mask.type); - - fmha_vsa_fwd_traits fmha_traits; - fmha_traits.hdim_q = hdim_q; - fmha_traits.hdim_v = hdim_v; - fmha_traits.data_type = data_type; - fmha_traits.is_v_rowmajor = true; - fmha_traits.mask_type = mask.type; - - ck_tile::stream_config stream_config{nullptr, - true, - /* log_level = */ kname ? 
1 : 0, - warmup, - repeat, - false}; - - float avg_time_ms = sparge_vsa_fwd(fmha_traits, fmha_args, stream_config); - - std::cout << "\n>>>> VSA sparse attention average time: " << avg_time_ms << " ms <<<<" - << std::endl; - - // DtoH: attention output (always needed) - o_buf.FromDevice(output_host.data(), output_host.get_element_space_size_in_bytes()); - - // DtoH: block_map (needed for sparsity stats and validation) - ck_tile::HostTensor block_map_gpu({batch, nhead, num_q_blocks, num_k_blocks}); - bmap_buf.FromDevice(block_map_gpu.data(), bmap_bytes); - - // ================================================================== - // Sparsity statistics (pure CPU, reads block_map HostTensor) - // ================================================================== - std::size_t total_blocks = 0; - std::size_t active_blocks = 0; - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) - { - for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) - { - total_blocks++; - if(block_map_gpu(b, h, qb, kb) != 0) - active_blocks++; - } - } - } - } - float actual_sparsity = - 1.0f - static_cast(active_blocks) / static_cast(total_blocks); - std::cout << "\n Actual sparsity: " << actual_sparsity << " (" << active_blocks << "/" - << total_blocks << " blocks active)" << std::endl; - - // ================================================================== - // Validation (only when -v=1) - // ================================================================== - bool pass = true; - if(do_validation) - { - std::cout << "\n--- Performing CPU validation ---" << std::endl; - - // CPU golden: block map + VSA LUT - std::cout << "Building Sparge block map (CPU golden)..." << std::endl; - sparge::SpargeParams p; - p.BLKQ = static_cast(BLKQ); - p.BLKK = static_cast(BLKK); - p.simthreshd1 = simthreshd1; - p.cdfthreshd = cdfthreshd; - p.topk = topk; - p.i_perm = i_perm; - - ck_tile::HostTensor block_relation_onehot = - sparge::build_block_map_meansim(q_host, k_host, p); - - std::cout << "Converting block map to VSA LUT (delta, CPU)..." 
<< std::endl; - auto vsa_lut_cpu = sparge::block_map_to_vsa_lut_delta(block_relation_onehot); - - // DtoH: LUT + valid_block_num (only for validation) - sparge::VSALut vsa_lut_gpu{ - ck_tile::HostTensor({batch, nhead, num_q_blocks, num_k_blocks}), - ck_tile::HostTensor({batch, nhead, num_q_blocks}), - }; - lut_buf.FromDevice(vsa_lut_gpu.lut.data(), lut_bytes); - valid_buf.FromDevice(vsa_lut_gpu.valid_block_num.data(), valid_bytes); - - // Validate block map - std::cout << "\n--- Validating GPU block map vs CPU golden ---" << std::endl; - { - std::size_t bmap_mismatches = 0; - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) - { - for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) - { - if(block_map_gpu(b, h, qb, kb) != block_relation_onehot(b, h, qb, kb)) - { - bmap_mismatches++; - if(bmap_mismatches <= 10) - { - std::cout - << " block_map mismatch at [" << b << "," << h << "," << qb - << "," << kb << "]: GPU=" - << static_cast(block_map_gpu(b, h, qb, kb)) << " CPU=" - << static_cast(block_relation_onehot(b, h, qb, kb)) - << std::endl; - } - } - } - } - } - } - std::cout << " Block map mismatches: " << bmap_mismatches << " / " - << (batch * nhead * num_q_blocks * num_k_blocks) << std::endl; - if(bmap_mismatches > 0) - { - std::cout << ">>> GPU BLOCK MAP VALIDATION FAILED <<<" << std::endl; - pass = false; - } - else - { - std::cout << ">>> GPU BLOCK MAP VALIDATION PASSED <<<" << std::endl; - } - } - - // Validate VSA LUT - std::cout << "\n--- Validating GPU VSA LUT vs CPU golden ---" << std::endl; - { - std::size_t lut_mismatches = 0; - std::size_t valid_mismatches = 0; - for(ck_tile::index_t b = 0; b < batch; ++b) - { - for(ck_tile::index_t h = 0; h < nhead; ++h) - { - for(ck_tile::index_t qb = 0; qb < num_q_blocks; ++qb) - { - if(vsa_lut_gpu.valid_block_num(b, h, qb) != - vsa_lut_cpu.valid_block_num(b, h, qb)) - { - valid_mismatches++; - if(valid_mismatches <= 5) - { - std::cout << " valid_block_num mismatch at [" << b << "," << h - << "," << qb - << "]: GPU=" << vsa_lut_gpu.valid_block_num(b, h, qb) - << " CPU=" << vsa_lut_cpu.valid_block_num(b, h, qb) - << std::endl; - } - } - for(ck_tile::index_t kb = 0; kb < num_k_blocks; ++kb) - { - if(vsa_lut_gpu.lut(b, h, qb, kb) != vsa_lut_cpu.lut(b, h, qb, kb)) - { - lut_mismatches++; - if(lut_mismatches <= 10) - { - std::cout - << " LUT mismatch at [" << b << "," << h << "," << qb - << "," << kb << "]: GPU=" << vsa_lut_gpu.lut(b, h, qb, kb) - << " CPU=" << vsa_lut_cpu.lut(b, h, qb, kb) << std::endl; - } - } - } - } - } - } - std::cout << " LUT mismatches: " << lut_mismatches << std::endl; - std::cout << " valid_block_num mismatches: " << valid_mismatches << std::endl; - if(lut_mismatches == 0 && valid_mismatches == 0) - { - std::cout << ">>> GPU VSA LUT VALIDATION PASSED <<<" << std::endl; - } - else - { - std::cout << ">>> GPU VSA LUT VALIDATION FAILED <<<" << std::endl; - pass = false; - } - } - - // Validate attention output - float scale = 1.0f / std::sqrt(static_cast(hdim_q)); - - std::cout << "\nComputing reference attention output..." 
<< std::endl; - auto q_ref = to_bhsd(q_host, i_perm); - auto k_ref = to_bhsd(k_host, i_perm); - auto v_ref = to_bhsd(v_host, i_perm); - - ck_tile::HostTensor output_ref({batch, nhead, seqlen_q, hdim_v}); - ck_tile::reference_blocked_attention( - q_ref, k_ref, v_ref, block_relation_onehot, output_ref, BLKQ, BLKK, scale); - - auto [rtol, atol] = get_error_tolerance(); - - float max_diff = 0.0f; - float max_rel_diff = 0.0f; - std::size_t num_errors = 0; - - auto output_host_bhsd = to_bhsd(output_host, o_perm); - for(std::size_t i = 0; i < output_host_bhsd.mData.size(); ++i) - { - float gpu_val = to_float_for_compare(output_host_bhsd.mData[i]); - float ref_val = to_float_for_compare(output_ref.mData[i]); - float diff = std::abs(gpu_val - ref_val); - float rel_diff = (std::abs(ref_val) > 1e-6f) ? diff / std::abs(ref_val) : diff; - - max_diff = std::max(max_diff, diff); - max_rel_diff = std::max(max_rel_diff, rel_diff); - - if(diff > atol && rel_diff > rtol) - { - num_errors++; - if(num_errors <= 5) - { - std::cout << " Mismatch at index " << i << ": GPU=" << gpu_val - << ", Ref=" << ref_val << ", Diff=" << diff << std::endl; - } - } - } - - std::cout << "\nAttention validation results:" << std::endl; - std::cout << " Max absolute difference: " << max_diff << std::endl; - std::cout << " Max relative difference: " << max_rel_diff << std::endl; - std::cout << " Number of mismatches: " << num_errors << " / " - << output_host_bhsd.mData.size() << std::endl; - - if(num_errors == 0) - { - std::cout << "\n>>> VALIDATION PASSED <<<" << std::endl; - } - else - { - std::cout << "\n>>> VALIDATION FAILED <<<" << std::endl; - pass = false; - } - } - - std::cout << "\n" << (pass ? "TEST PASSED" : "TEST FAILED") << std::endl; - return pass; -} - -// ============================================================================ -// Main -// ============================================================================ - -int main(int argc, char* argv[]) -{ - auto [result, arg_parser] = create_args(argc, argv); - if(!result) - { - std::cerr << "Failed to parse arguments" << std::endl; - return -1; - } - - std::string prec = arg_parser.get_str("prec"); - - bool test_result = false; - if(prec == "fp16") - { - test_result = run_test(arg_parser); - } - else if(prec == "bf16") - { - test_result = run_test(arg_parser); - } - else - { - std::cerr << "Unsupported precision: " << prec << std::endl; - return -1; - } - - return test_result ? 
0 : -1; -} diff --git a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp index 67936c4353f..9fe8b365b00 100644 --- a/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp +++ b/include/ck_tile/ops/sparse_attn/pipeline/block_fmha_pipeline_qr_ks_vs_async_jenga.hpp @@ -318,26 +318,26 @@ struct BlockFmhaPipelineQRKSVSAsyncJenga { if(!block_relation_onehot[i_total_loops]) { - i_total_loops++; - if(i_total_loops < num_total_loop) - { - // move K tile windows - move_tile_window(k_dram_block_window, {kN0, 0}); - k_dram_window.set_window_origin(k_dram_block_window.get_window_origin()); - - if(block_relation_onehot[i_total_loops]) - { - async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), - k_dram_window, - number<-1>{}, - k_oob_ck, - k_pre_np); - } - move_tile_window(k_dram_window, {0, kK0}); - move_tile_window(v_dram_window, {0, kN0}); - continue; - } - break; + // scan-ahead: find the next active block in one shot + index_t next = i_total_loops + 1; + while(next < num_total_loop && !block_relation_onehot[next]) + next++; + if(next >= num_total_loop) + break; + const index_t delta = next - i_total_loops; + i_total_loops = next; + // jump K/V windows to the next active block + move_tile_window(k_dram_block_window, {kN0 * delta, 0}); + k_dram_window.set_window_origin(k_dram_block_window.get_window_origin()); + move_tile_window(v_dram_window, {0, kN0 * delta}); + // immediately prefetch the active K tile + async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), + k_dram_window, + number<-1>{}, + k_oob_ck, + k_pre_np); + move_tile_window(k_dram_window, {0, kK0}); + continue; } // STAGE 1, QK gemm From eca3cb3e0abdcb927a02f9b7b9ed786b9a9cdda2 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Fri, 24 Apr 2026 05:13:51 -0400 Subject: [PATCH 6/7] sparse_attn: add bm0 dispatch for sparge blockmap compatibility Add bm0 field to fmha_jenga_fwd_traits so callers can specify the preferred Q-tile size. Codegen now emits separate tile configs for bm0=64 (sparge blockmap) and bm0=128 (original), with CppConstraint guards to select the right kernel at runtime. End-to-end test passes for both jenga and vsa paths. Performance is known to be suboptimal at this stage; tile sizes and warp counts for the bm0=64 path have not been tuned. 
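A usage sketch (illustrative only; not part of this patch; the traits field
names are copied from the existing examples and the argument setup is elided):
a caller that wants the Sparge-compatible 64-row Q-tile path sets the new
field before dispatch:

    fmha_jenga_fwd_traits attn_traits;
    attn_traits.data_type     = "fp16";
    attn_traits.is_v_rowmajor = true;
    attn_traits.mask_type     = mask_enum::no_mask;
    attn_traits.bm0           = 64;  // request the bm0==64 tile config; 0 = don't care

    fmha_jenga_fwd_args attn_args{}; // Q/K/V/block-map pointers and strides filled as in test_sparge.cpp
    float ms = fmha_jenga_fwd(attn_traits, attn_args, ck_tile::stream_config{});

Leaving bm0 at its default of 0 keeps the previous behavior: the CppConstraint
on the 128x128 config matches and dispatch picks the largest tile.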
Co-Authored-By: Claude Opus 4.7 --- .../codegen/ops/fmha_fwd_jenga.py | 58 ++++++++++--------- .../codegen/ops/fmha_fwd_vsa.py | 58 ++++++++++--------- .../ck_tile/50_sparse_attn/fmha_fwd_trek.hpp | 2 +- .../ck_tile/50_sparse_attn/test_sparge.cpp | 2 + 4 files changed, 63 insertions(+), 57 deletions(-) diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py index 1f0a78048d9..fc4b8642ddd 100644 --- a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py +++ b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_jenga.py @@ -690,12 +690,12 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128): [ - FmhaFwdTileSize( # fmt: skip -- 64x128 tile matching blockmap kM0=64, kN0=128 - 64, + FmhaFwdTileSize( # fmt: skip -- 128x128 tile (original, for old sparse attn test) 128, - 64, 128, - 64, + 32, + 128, + 32, 128, 4, 1, @@ -703,13 +703,36 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: 4, 1, 1, + 32, + 32, 16, + 32, + 32, 16, + -1, + CppConstraint("t.bm0 == 0 || t.bm0 == 128"), + ), + FmhaFwdTileSize( # fmt: skip -- 64x128 tile (for sparge blockmap kM0=64) + 64, + 128, + 32, + 128, + 32, + 128, + 2, + 1, + 1, + 2, + 1, + 1, + 32, + 32, 16, - 16, - 16, + 32, + 32, 16, -1, + CppConstraint("t.bm0 == 64"), ), FmhaFwdTileSize( # fmt: skip 16, @@ -774,27 +797,6 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: 16, -1, ), - FmhaFwdTileSize( # fmt: skip - 128, - 128, - 32, - 128, - 32, - 128, - 4, - 1, - 1, - 4, - 1, - 1, - 32, - 32, - 16, - 32, - 32, - 16, - -1, - ), ], # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], # (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], @@ -909,7 +911,7 @@ def get_fwd_blobs( for tile, pipeline in itertools.product( tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) ): - if tile.F_bm0 != 64 or tile.F_bn0 != 128: + if tile.F_bm0 not in (64, 128) or tile.F_bn0 != 128: continue if pipeline.tag != "qr_async": continue diff --git a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py index 217cfcfe2a4..208877037f1 100644 --- a/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py +++ b/example/ck_tile/50_sparse_attn/codegen/ops/fmha_fwd_vsa.py @@ -690,12 +690,12 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: # FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], # (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], (128, 128): [ - FmhaFwdTileSize( # fmt: skip -- 64x128 tile matching blockmap kM0=64, kN0=128 - 64, + FmhaFwdTileSize( # fmt: skip -- 128x128 tile (original, for old sparse attn test) 128, - 64, 128, - 64, + 32, + 128, + 32, 128, 4, 1, @@ -703,13 +703,36 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: 4, 1, 1, + 32, + 32, 16, + 32, + 32, 16, + -1, + CppConstraint("t.bm0 == 0 || t.bm0 == 128"), + ), + FmhaFwdTileSize( # fmt: skip -- 64x128 tile (for sparge blockmap kM0=64) + 64, + 128, + 32, + 128, + 32, + 128, + 2, + 1, + 1, + 2, + 1, + 1, + 32, + 32, 16, - 16, - 16, + 32, + 32, 16, -1, + CppConstraint("t.bm0 == 64"), ), 
FmhaFwdTileSize( # fmt: skip 16, @@ -774,27 +797,6 @@ def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]: 16, -1, ), - FmhaFwdTileSize( # fmt: skip - 128, - 128, - 32, - 128, - 32, - 128, - 4, - 1, - 1, - 4, - 1, - 1, - 32, - 32, - 16, - 32, - 32, - 16, - -1, - ), ], # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32, 160, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, 1)], # (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1)], @@ -909,7 +911,7 @@ def get_fwd_blobs( for tile, pipeline in itertools.product( tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) ): - if tile.F_bm0 != 64 or tile.F_bn0 != 128: + if tile.F_bm0 not in (64, 128) or tile.F_bn0 != 128: continue if pipeline.tag != "qr_async_vsa": continue diff --git a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp index 350d1803f66..62d40ffbe02 100644 --- a/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp +++ b/example/ck_tile/50_sparse_attn/fmha_fwd_trek.hpp @@ -272,7 +272,7 @@ struct fmha_jenga_fwd_traits std::string data_type; bool is_v_rowmajor; mask_enum mask_type; - // TODO: padding check is inside this api + int bm0 = 0; // preferred Q-tile size; 0 = don't care (dispatch picks largest) }; float fmha_jenga_fwd(fmha_jenga_fwd_traits, fmha_jenga_fwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/50_sparse_attn/test_sparge.cpp b/example/ck_tile/50_sparse_attn/test_sparge.cpp index 7c30a10b062..81a49ca006b 100644 --- a/example/ck_tile/50_sparse_attn/test_sparge.cpp +++ b/example/ck_tile/50_sparse_attn/test_sparge.cpp @@ -249,6 +249,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) attn_traits.data_type = std::is_same_v ? "fp16" : "bf16"; attn_traits.is_v_rowmajor = true; attn_traits.mask_type = mask_enum::no_mask; + attn_traits.bm0 = BLKQ; fmha_jenga_fwd_args attn_args; attn_args.q_ptr = q_dev.GetDeviceBuffer(); @@ -291,6 +292,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) attn_traits.data_type = std::is_same_v ? "fp16" : "bf16"; attn_traits.is_v_rowmajor = true; attn_traits.mask_type = mask_enum::no_mask; + attn_traits.bm0 = BLKQ; fmha_vsa_fwd_args attn_args; attn_args.q_ptr = q_dev.GetDeviceBuffer(); From b00e5449c8cd105dbfc8dcf933cf7849a57d2e04 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Tue, 5 May 2026 03:13:24 -0400 Subject: [PATCH 7/7] sparse_attn: split KStats kernel, add README + perf charts - Split SpargeKStatsKernel/Pipeline out of BlockMap (Kernel A produces per-block K stats workspace consumed by Kernel B), removing redundant K-stat recomputation across Q-blocks. - Add example/ck_tile/50_sparse_attn/README.md (status vs upstream pinned to ae5b629, unported items, usage, references). - Add example/ck_tile/50_sparse_attn/docs/{speedup_vs_sparsity,kernel_breakdown}.png + reusable plot_sparge_perf.py (b=2 h=32 s=16384 d=128 fp16 perf snapshot). 
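The resulting host-side call sequence is a two-launch pipeline. The sketch
below is illustrative only: the kstats entry-point name, its traits/args, and
the workspace sizing are placeholders rather than the API added by this patch;
only sparge_blockmap_fwd already exists in the example code.

    // Kernel A: compute per-K-block statistics once per (batch, kv-head, K-block)
    // and write them to a workspace buffer.
    ck_tile::DeviceMem kstats_buf(static_cast<std::size_t>(batch) * nhead_k *
                                  num_k_blocks * kstats_bytes_per_block);
    sparge_kstats_fwd(kstats_traits, kstats_args, stream);        // hypothetical entry point

    // Kernel B: build the block map / LUT, reading the K-stat workspace for every
    // Q-block instead of recomputing it.
    sparge_blockmap_fwd(blockmap_traits, blockmap_args, stream);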
Co-Authored-By: Claude Opus 4 --- example/ck_tile/50_sparse_attn/README.md | 45 +++ .../50_sparse_attn/docs/kernel_breakdown.png | Bin 0 -> 85047 bytes .../50_sparse_attn/docs/plot_sparge_perf.py | 258 ++++++++++++++++++ .../docs/speedup_vs_sparsity.png | Bin 0 -> 127494 bytes .../50_sparse_attn/sparge_blockmap_inst.cpp | 110 ++++++-- .../50_sparse_attn/sparge_blockmap_trek.hpp | 43 ++- .../ck_tile/50_sparse_attn/test_sparge.cpp | 33 ++- .../kernel/sparge_blockmap_kernel.hpp | 46 +++- .../kernel/sparge_kstats_kernel.hpp | 136 +++++++++ .../pipeline/sparge_blockmap_pipeline.hpp | 154 ++++++----- .../pipeline/sparge_kstats_pipeline.hpp | 110 ++++++++ 11 files changed, 839 insertions(+), 96 deletions(-) create mode 100644 example/ck_tile/50_sparse_attn/README.md create mode 100644 example/ck_tile/50_sparse_attn/docs/kernel_breakdown.png create mode 100644 example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py create mode 100644 example/ck_tile/50_sparse_attn/docs/speedup_vs_sparsity.png create mode 100644 include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp create mode 100644 include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp diff --git a/example/ck_tile/50_sparse_attn/README.md b/example/ck_tile/50_sparse_attn/README.md new file mode 100644 index 00000000000..c7191c8e828 --- /dev/null +++ b/example/ck_tile/50_sparse_attn/README.md @@ -0,0 +1,45 @@ +# Sparge Attention (Composable Kernel) + +A Composable Kernel port of [SpargeAttn](https://github.com/thu-ml/SpargeAttn) for AMD GPU. Both the block-map pipeline (mean-pool → cosine sim → pooled QK → top-k LUT) and the sparse FMHA stage run on-GPU. Two attention backends are exposed via `-pipeline=vsa` (default, faster) and `-pipeline=jenga` (async K/V load variant). + +## Status vs Upstream + +Implemented: +- per-block mean-pool, cosine similarity, pooled QK +- top-k / `cdfthreshd` block selection, BlockMap LUT +- sparse FMHA (both `vsa` and `jenga` backends) +- per-head `topk` / `simthreshd1` / `cdfthreshd` + +Not yet ported (upstream pinned to commit [`ae5b629`](https://github.com/thu-ml/SpargeAttn/tree/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a)): +- **K smoothing** — pre-pool `k -= km`; required for diffusion / video checkpoints (CogVideoX, Mochi-1, Flux, OpenSora, SD 3.5) ([spas_sage_attn/core.py:L53](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/core.py#L53)) +- **is_causal mask in pooled score** — required for causal-LM prefill (Llama, Qwen) ([spas_sage_attn/utils.py:L338](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/utils.py#L338)) +- **attention_sink** — column 0 forced ON; upstream is hard-wired to `True` at inference ([spas_sage_attn/autotune.py:L355](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/autotune.py#L355)) +- **pv_threshold per-Q-tile skip in attn kernel** — pure perf, ~5–15% on the dominant attention slice ([spas_sage_attn/core.py:L265](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/core.py#L265)) +- **Sort-based top-k selection** — replaces our O(N_k^2) iterative argmax; matters at long seqlen (s ≥ 16k) ([spas_sage_attn/utils.py:L345](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/utils.py#L345)) +- **Q/K int8 quant fusion in pool kernel** — enables a downstream int8 GEMM0 in the attn kernel 
([spas_sage_attn/utils.py:L371](https://github.com/thu-ml/SpargeAttn/blob/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a/spas_sage_attn/utils.py#L371)) + +## Performance + +At b=2 h=32 s=16384 fp16, sparge (vsa backend) reaches **1.78× FMHA throughput at topk=0.4** and **5.04× at topk=0.1**, and stays above 1.0× across the full topk range. + +![Speedup vs sparsity](docs/speedup_vs_sparsity.png) + +*Speedup vs FMHA, b=2 h=32 s=16384 d=128 fp16. Shape chosen to match Fig. 10 of the SpargeAttn paper ([arXiv:2502.18137](https://arxiv.org/abs/2502.18137); Mochi-1, 22K context, head_dim=128); s=16384 is the closest grid point. Gray-outlined points have >30% inter-rep spread.* + +![Kernel breakdown](docs/kernel_breakdown.png) + +*BlockMap (`_pre`) stacked on attention (`_attn`), b=2 h=32 d=128 fp16 topk=0.4. BlockMap is roughly 17% of total at s=16384.* + +## Usage + +```bash +ninja tile_example_sparge +./bin/tile_example_sparge -pipeline=vsa -b=2 -h=32 -s=16384 -d=128 -topk=0.4 -simthreshd1=0.001 +``` + +Add `-v=1` for CPU validation; use a small shape (`-b=1 -h=2 -s=512`), since full-shape CPU reference scales O(s²) and runs 30+ minutes at s=8k, hours at s=16k. + +## References + +- [SpargeAttn upstream](https://github.com/thu-ml/SpargeAttn) (pinned to [`ae5b629`](https://github.com/thu-ml/SpargeAttn/tree/ae5b629ebb41e41f86b3ea2ab5a3283f13ac151a)) +- [Paper — Zhang et al., arXiv:2502.18137](https://arxiv.org/abs/2502.18137) diff --git a/example/ck_tile/50_sparse_attn/docs/kernel_breakdown.png b/example/ck_tile/50_sparse_attn/docs/kernel_breakdown.png new file mode 100644 index 0000000000000000000000000000000000000000..8704334155cbb0b5351293e9406d594de2ebfc82 GIT binary patch literal 85047 zcmc$`c{tYJ`Zjz^sia6mlOZXYsc0Y~5h|Hwj1-y4lsQSHP{tA(R5Ht0<~c*jSf)@Y zijXPOd#?6(@BMp^_c`9@`S1B2M|)T9?)(0%wXW+ruk$>w6`*ojem(70S_*}-{-nZj zRSIQg2ZchtV+{@d=2Ypn8vNhkD{|Ub)NC$ZaXxQvN;!4@imj#16-#qt4kuH42Xhc8sfkI)vNq$jPv`fcRs3??^ z$B(JI#Qf-X)KmYoEcL6Mq1^jQz|F7~k9S`4>u02`3d|4mxv6jQILYFgMVdg>eQj0? 
[... base85-encoded GIT binary patch data for docs/kernel_breakdown.png omitted ...]
zKEk1h##6M&=O14f56A<2SwhT83;ykYs+!+G2#(dbkUvnJY9x=C@$^}<((L~noN~U$ zzUKY;Z7$AY|3@#5`}zM@tutFKWq>RTbyTx`e<2jV zrLFCLW8>j*kb2Lal}R7W(-2N~0KX@=pzX=}#!I9CXyJe7ZZ{o4B}!!< zW!az;jIM9q{w1Aw**Q zCm9)?Km`Q?`|QUpyuUwSz4bORGG{ZR_xm{kywfoD=FMidKS`Dh@B%_AulI;NE7i@*1t+!>mY7gAA#5P_!| zPs|83tw`UA!KuCB+x&keLDqvKy}2KK+I_TsRDc3`H89r)Uj{`FnG8t)RIdlqJ$cCB z=Kh5w7k9(f_aqW8;kxhINRvSCioff%D|%y|@-pj>Y`L8Pw^S^DX~yr|F-a;R55^}cciAGw33C418)R7@vOHuK;zKqgzhmh}o_oDacF8X|m{pAz4S3s(~_n0#yk5qNT^Y@%76Rk_y+y-gH_pvo*T)&6y$_ z#TtK^ruSv0q21<@pN$|qt9f(AdP7*AkX7_beDP{C>A9YQ_!tDO?Y!YD80FVXDA?5dqu+^UoC(A8wu+NnLde7*KYT49a9mE#TDQjMT)8$173 zCx#w>Nf}XTN_ih_Rm-#4vUXilN~}Xov22iEa_rU8(2gg=-%hrU9p6~_GNJjrb84(z zQ}4&fU|I2D$teT4v?{54y7;d{Z^=E~r6J0{^0z*iQ7cFx zmaLtn??bf7j1TEPWmZD$p&6f3%UiA8g(xdHgkkOvoFedh7xkK>ZNuqQ--LI0t>u>; z|zaT+DIH4wmKM9_I0uTS}k5HU2?1;_tF}ZdZmAt@y!923{ z-5fVk9&9J_PVNEcU%;koN9^KXtq}O1YEJ*@zzaJ#+03ZmOJrNR$;2;y4vQ_#-}hDN zcK{g(9U@*Do}Iam|0u`OQjg3_;Ym@djp|MaXhv8K_+Ky~MLOSmRr)mNf3|N=e)y0= z>o9Ek%aa*@$mMMccn%b~g~~E$@5k4q#Q1lD;h>>zI++AxAhwy=$4r4Dwy>%NK-Is& zKAZnM8-IKPFr@ztdGE^u|4Ze)jOic#duZkKQM5M$3#i2$Dj9d!Ff01ekxXS^Y=JBJi ze&4fG7iY(WbpSuB0BhcsWClWUW|~9uWHSiUNO<#y05rk(5bK13BN5_Nj2vE}KZf^3 z9iOik^!^DfVX#N)K!5vS-6cSBuCW;zqK=&G{Shoc-e} zd%S|HhVJKFi-?1c*>UHeBmQ58BmM8>`}_ZnL)PVpAZkI1iKHqzs92-*94#W4915S^ zWRd&vr7!@R{KJVh1*ASGapd=W$f}Ejh?{m$f)B=tSO0H0t!34o zJvMy@{l!9Gdv-(VcPW{9My%YtbJn>n%Bj`d3(wsh^TCh?5f|q;uJsPTOUd>HNZJ8F zRe=aNo8)`Uy`UGm-JF5)-X%Awu1E1F|u_u%r|kG2|;W??qHAna!~?-b_m;;TravHvlV02|6x-i@ely zyZD^2MX9RCwd=AO4ZCL)Y>d5?c)c?xK61Ez;h2|p?c2L@dzbaR%o5x>-z8`6_|-J} zpF)!_-YEfLyWRAgf(US+*kMh_-4IR=LU?g>A>s$U`k+GdNlQzs!K~T_q#@YV*pmfi z%9J4FK1PdQVjp+?_$|L_<+dBPJNWA?^M8TZh8D8abg{JMbAi@Z6bB77WIbgvAuhI( z=Vdg6Z0OIx@f|uu41U7FuJ-m=94g697={7wn||sCWY7T?UR?)^9s-cQ;F}QSjAdt* zIt#i160e%V%KEyxy4nito9|CC@w0AqepWS|FKzI`{j+tw2RF~1apIub9%qC1GK|N3 z<6ucW*Jxj&v+ikUPzqR{o03P6GISC$K268ssHa8HjVGe_)O|3D4@9{=CsVZ_ojj8- zm4L(P4qmDkrcexxRYB|ElrKqMWXL&o7ECKn=+9n;?lM6CT#e}f$P)-s(oLN}mo1bp zgkZN7m{Qg!UNR}s#2#gpUAr>df-#mN0Y;Hq9q7REF|spEv#`<>S|8!igfi%}m*Mb0 z&e&TlJd4!%YmHCkyF5UA1Z7avECRL<67NjDBmxGD`A;J31IE@to^~`zF>p!T?121b z_F2}02^1-d4eQttMNgv5K9?_vMM{!em4a#46MLVCh@8cI8o!{RAkroA^0p>*A=5?` z>1Be-iJ9GAB+!M9Ca5K!@?$~1d}+5LJp~M^cvCQ^CjtD{t*zST&P+ z%{!v^8hPnRINa$j*A2@SG)-lYTgwf0lZ*Ev5bHPuPT?_PHxhAChU-EZM*IG z6|@$if%J(y3U?B!3RtBnda zq(I=&dMc{=E#^)9yl?nr#$;Hqt>0>ImmqhzyZ9%zPhDLW5QsiHsd;<9IX{zByieNV z?*x#w2H>wlzNBSh7#iQFnYFUTnEG#$({JHM7q2 zs!>=9o}^X288mVA9vXCy=^9OGZ0aEW~%(sB|vyogHMt{Hdm$9=1eMd@kF zcdoX0x!gGDy}879sdTA;PUpt+Q^@k@1c)7(-<5z{?xGve`(gE(bld=++mK>z`EC&) zR`?dM=bfBc*HQB(Cerjk-viOIio10MGK(}H4F{)uK3Zk;(&XR=W#xHlruR2DXqht( z^Za>X*|aR$RKzsmS~*m7t{}$q%(c^5d(Db;8%MMNfVE|`+*t4U_!gnl7`s=kgTBV` zy;Xs3+^CkH=zd}SQ}$BOcV;L2!kZeD)~e@Sue~&JJAmB8SXq=;poj21pFUZ>x61H^ z(HTuU*_?2y?_@%CWLcaz29poQUX?nYKZ-(;0&$>PahKw_ZKme-M%v(Tg-VIcg5$Pl z--lb-9Qcv#=+bX>LpNll+M)GT#uL8}{E6h!Sq~i_Rk;WhLNHe~d?_?XK&QfF4DUF} zLQ(eGXVaEZK_R;M_crXdYZf2r-f^4Da)mE9^NR7eh@`!0eVtdQw>df}J6WyWufM3E z(tlJs{C&ywo4-D)`)f@OP(7g;l)A9A^3|2Ed{wXVGc9K^MrZzx6LD#8Mt(Co4a(nP zn=ylIU+}(I==N=$t5%}GVg96Joz#gpf(^3JOp&*YL~4`={9(?f@VB99zdqc9 zTM=y`VcdIn`u#&y%7qzEVopQ_=X}cWPdZUP>TPeU`vly?kXdp|uk^uEV zUiyiC!-~eoHiu$bc%A8AhlhrU>l-vjDHtNJGCY3&W*PrT%xq zYgzXbFQf7Y!NYG+40$=GIL_7pV^q@VeFiFh1i`K^& zB^c|~jc_NLG{89h*kF8vb(w%+I`?6mVeN!f&nDZ9*itk}>X?mSRfqkZv?s>!pfMvs zdE=7ClbEQjg3oqCFU{U|0vt?n&`1U|{`dq9%K8bv9e>Wg2E4Ud4r!=T4b1`Vy~UA3 zX{GM9_5C=j7Nmmx+1yvTpQaojTWWM*FJ_7y{4EU^kwalc(cGxJ2u|?w_I_g8L1E}v z?!*$-H-G!KoMeHgqcPjZPY}&wuE7xO;%CVIg`h!=EKsATF8LswlI8Mn$w9>b76&mYVa%GU8{rB!CuZ@ z5m!zmlFlc*?v=;_J&_91JBFHDFuYobF+vsHr3zPJLp#*fta_frqLIQ9CUIDQy>Tn= 
z!NQ2pI1GDq9t|3pE|D2Vw(40R7aRJsDZTNUIEiUpVvTX|lGOp~QO6;o4^SP(pk^5C z&+2`%lg(Ut|8U7#VTxQB9p(imssCP1<)E-b8si(p0~OL;ePH&4Ea z^dvDkTuvw=_0{&|Rs(9v8M}fXzw@`3=@bvK6o7u9s7Hnzh8E;iC{0J?vN%4p@*DE6 zpio9NxswU$N*JUK(v`@eE~RAwwLkF+T0i&DE%6-JGe1^DLqXBuFuF%0e2x4J!h1vF zV3Hz@%*;ph4q)cNTQvR?hA5?s&{`CxAgoWKA0dYl1f9q9Smxps!&LGK`Vjs)(b43^ zF@8Wba=}dp^1zoUX+XCWZ?bpO4_DF>1LsO!i91QURsbGb5VH3$H=FeG(1L>sQ9$oR z!RzbH>#?G~&+oIrC!e7>&Ijro!ORPUtb94pjyMuPN0#}wP@0kys(Q0KJ`>`HhcfiR z6q>Ib{~)ucL63^k`bkZNNI#K3U%%QV-(z9O`HufU7i~SL67EmdR4Yn%4chE?q%NKBi5BRK5TGs0G@sqfSi_DvOnd>+=!-!s4W_0fwGK=$--^uzOh-!+S&n&S7khWjxmT_k?nx`uwxf9|R8&B@Z-O zI+RL@;GaIcYT6_Hcwb$P2&`VMMxHGyIMJL-f2#weM(-dQf`y)M2tzegknpC5Bm&ix!o3!1etlZWpop z@Im1g92}f-rdUMHsgaEaz=FnTA+{91e5n(%u$%{- zusqY870Ya(x+=zGD#|!Pdp&*lni(JrY%)>M_^oK*F~Yltla17e=Z1f5w;DoaEYzf9 z9#W0S$fcq!T>a0gNOJUQ(76vS>`6HOfuo8g3ByrA9}{p2Y3y3PC9eYzoq!o_G zL9z2iw{X*5nWx1P*U|6$Gtv!6WQL0)$qe~4S68^t+bWSifh1oxoF`1EK z6qnEZmrDDYp?TPw5nP1+Rq!jWfC7nD>i7s!8;L+7%nPQ(a(n@3SHY%DU*S^wcMK)o zlj8pWP-@>?5rOy+d_Nj@p{7YOe{}YiRkdpnqg`@)&Cn)@{h>fA>W32yN!;+D5!GP& z^7-=(!zshC-tfD{jRX+QV~Z~tY$yUD$v1D_O^ca`$xip?)BmHAZ;fhE-b&K? z$gqD#X8)!t!99Y3O#h?1w@>ehOnOOQ{=u-Gqyu8ypHib@rJ1 z2`Sr1lRp1oX&oYtHg2&@?*lO|{CM>`5RDjw)Y^Z0@UjwbB01a9ktzg>>!2oc9w-a zx*1n%u8P#D-OHSRt7HF%E7qB&NN?B9xC-OKoB=h9FWn@Q4UeWSXq$?zTT~kN*a^H&n)2~kq`4yaaBHfZA zZvQ34tl$Lv13^yf#E)w(4Bb*^J6Q9!Y^>BK`kHR>#4%INo`SD=%F<36KMiMg!e9_* zvgZl@tIJZ{^W=iVp1QBsq}6*0)(x31GS+MrvNCS9sMbB_oZ9~L-P^eO(2h1e8~O|# z_^RGlKsBZBb!dv+>-fy9=pQzm*)$e-F?q#3-ABElR#RuC#M&G>o~)}gma}P~G=KMl zS+$h)LIYvIjFGL4RLRh~Q@k;axEtK^Fnvt&aqpb6os{uh*)%$6Pgz(|6fTszwa5$OKK;A7RV5O z40;jurm|5NM)aO@;4`r#(7+9 zhMk0M`tX|Xni2#vk7h#~2gTFHcaZ$dS1>aP4!kQLeeCbk@fo7H>bj4d^( z|7DDgvo~Ns7Mw#$)N*ASFp)j_%S8DN-45VRC!rW^Ho0@>4th4(_wb=Kr{Zt#V;i8M zafm@bQ!D^bt{Qwb4;PpGtI3SVLkst?(9kXvopu zig76Ra3PX(38wrV1&6pPpOPZ>P%9v0aKY%<7F#EZS~~$@l|7&b3f9NAa{{CoSrU*d z(>;)L0$obqwXBF01F!GEIm#47*e1!oaxg6{@4_{;68Q*(t@)q=&&#>w=H9Y=PdZ9Y z$B{(sw36_1scWE*2@78Y3ThfNK!$?I;Mf{cgCSFpBC`e;t@I}V z1*k7olhjQv3pp!~pnc*j?EAmjC4X4L2%9QWwH0Xp$T3-#WnKNi_YX09p(qWkn=*q> zP?&t2J6QQdntVa$0XMF3MrHRmWpvU0reS` zmo%28xC10lHB2@XXNu^C-o%{Kcm!d*(=i4Vu6?uzO-WgJJ4q>vAVdl&#tYMm`0~<$ zjKYyGWKyI>1LL3;gixRtDygCb2QZc4A}(5$c6BA@W`QsZA(NxyB#!Fs zk;TR$Ej@D6$RA36EYT`diFcr8CT+eLf>p_jKodjp1~l`a(`Dqwd_qH6LqI^F z(Y5GD0#9}H!6{}W7NC%BY#=8QAw|YSI8~|PnlW#+A_`mEXj@Cmam@iROH$m8jSl1x zJV4KpTX>hqcjd)c&uZ`yeL$qW(|d8B0{|KVlohwCBRPO(0bxl?Va6DG^PF61qDlBR z+QcDi>^|iKQf-ZS;{oBQal>}(z}pceCZOWsomI16v_I2n#Jc;ZNl zwO8N2DSzj?{YPePmoba3>^pa&KghZ`A73BzVXD9tCWnfP>k^j8NmJFLc5rgAeih;8 zx)sqn>q2xG|Ai%CDg0SGIaTfbPO1t_nkZT+H$l|Z^=Ps~!-<>K!@t679==`Y9s>~is3_m6*A!hJXbM#$fP{?E7k zzjDWlRAd;x?|($*^LuQJTIS3<^#%4$o~?G0<(M-=f1>=A?wn6olh4ZEt0Hs5vQ$TO z>z(o&*1U>FeB5TgzeN3{sGrJF<4ISYt7q-$pChzzPkNt-^g^>27U$JsiZmU}PFiwq z?>+LlUEAiLKUS~f{?+I&#=36|ZfKwOj2S$6uiThl^uXgMM|N>;S7fT?FJmv{|F^Xw zqJKk0$W`geJn_`Ls`3Ysd7^1*T-=gYilL5M40c3K`Nv05ROj#1`;e5ViZ2mKV{$L@ zx1OrL!;3l7I%o(XYj!krY1nJu;~Tl6o@&=sl-)~UW0XC-&ov-i*S5f+ zH{{;g8^VD{@v25FdAgCEF5FUh)hZ4|o84{2Cxi5bFww|Eo1{c-~0Nup~7Kd?3B3Q zw9e0T_NnBNu7;4QQnhEr_T`DT&#BfC{c4(~q5N}1N%)eK+UDx^ow%1b=F8Gzj(P!d z^Rl_^sZ(;KhVRC);ETh^dND3xum$WdR8+iX`e1Q4$jB5lA;L%i*mZUPx^?T2b;m4( zPFjhZoq+^xvuO20&aGRVe$8HQUcajL1Fi&ONy|ZiQ8LO`j~_aCFboinA6)!MJ1twhm94FQ#=4#j8jAXW_MmjYsB+G zxADdD;6Yy3vc8 zIP0I-&H_((F*I~Bkjkly9XogCp+uR48w@-_KtyCJ%8Ermu#MOD2g5;Xtyc%Uj2}zv z*s+YXSm*cE^6{LiqL^5_(_Sp;^GV@3w#h|(yz@u*Da0^-4Xz&FlD1u>Geb;jczF1- zxibQ5egLg4YyvQ80H#)H;PuTwX{%;-N{LJvLA5S<_WR-VdJH|nr<170I!=2D8GchY ziXLy&47IJ2Ah0GvsKYpL(Y~_V3;~})qnt(ok&NnO(`wJ{h)3O_u$#9I3M|S1V(9rzd 
ztMCElg4FRXMD2PI@{DrBNT5Z^(clqaNUAj+LX-|?S7y&uolX!sPw(YCgp7a<3Wytp zC^TMxh`O=}ziIyW{{G26e!EKAvsT@3w8nb!^W(wt0+?@|iU_O;AdFZUj?g3;mq)P5 zj`Z#yOy-^SQrPb4;lcj++=vXkAh%m8w%0(5Xkch~{Wx<515A?fvLX5yV`@QkJ&dgb}UP6&F6Tz<7qCa9@6IS@TTahLH zH*UB>$q^2xL*OPm%kWYYBjd@xs`oz?4iMBj7(DrSm+=PE8(8aJvqN#FMyP!Txv!MA z<%#7dROm51k(zomSgb`7bw4}532!?dAqOw72beGcY3X?ge=9Qyz3q4wz_@{h#Wf^o zEFgbDbPG!G(>=|*D--n>ql7Cvbl?Gi#mA0=t2;krd4o6)M@-NuD=Wl{bAiXdh)IiW zn~gA$ym^!R)#kN$p{!fWp0?T@-?EkU=!~aD?Pm(4pU+~T26n~r{Zczxa}VTa+l>}U z6#sYkXCneDL}9xX+sv6t2%{@E{Nfl7n|RbF!vblYISe?^LtDzqgTW+u zImWNNUUT!f6kRcD@(rrZ5}70TzU zLNZ5$a9Bj-ZJa@GCVR1z(JpMU)=VzD2ag^V4@kmHejfLA0t0K|3UuNv8C~$UU%<P+y;!E6I}v+XRk@Mzmf^sPb7Da1Rw)`wErMyPUjaVYcSc1d~w>!{Nf_$!Qo9%6XH` z{VEqH=RBWKyC|2qX}uROTv!D`vwF1Z$Mj$J{#UNdSiXFDx1*Rxw4k)K^wZa`>153V zJCuY?qv-a=>Q-665b&d{L0riXXpffQfX6u`<88yfU;_tJ&;UsN)aldnK>4sRx_-6G z>ITH{92~k<+v-!6#d*_XCl7DEPo5H9#c1R4(yMm?C2>B^$T;%cgH!nH-q^QG)~w;i znUW8+K0i))_IXE`*@X|kN%ug_!S1ILmHbBB=aCxC1LoxAoeJgvbx#=9_dC#|divno zw!#S6`03LGWhMs~*OBjmAbQv1fwN%g;F!OIbengW*~uBKTXR=cBnf)8;7xX1NWKbO zb9^SIVwzcpBFu;z7*#&HN!o%QBe#6)p|p(5C+yLA=@so7!aE`n*7;KUvIgFTRGTV-H73^Ss55R zaytX$+oUN|+EeUO698&Fw&}Q4Za8nFk~yV41k0RZV{pu!d;HWpvr-u`KPb#iMnEX)3-yShFGdrVECrE8bfhYF? z4o*&h37WXJX+RR)0BC<#P)A#A^(F2DE4YK- z%laV^07A5S3>TCNR7fEsTte?NHh#k5|n#*vaJ4ZDV;!h#IX6OX?Gcn>eHXE?pQyVVP$pbwo0<}@Gs z^mmBI=bjxyftNY@s~zHi-1QBP6ETF|^j#8*MONP&T9xq2%-LE={hE5!_)c&%7be6q zC=G6BeCx);TZQw;aR3L@u_A<%ue7Ln$cj3J<$FWHWiV*ZPzXE#9SSn&DauTUDf00g zDH^(EQ_x(fC^e~jnlyRxC-^7C)E5I7lR^b`4O{#xs}9VjVF7B0x=IY?m0^JoURRIk z;PFQ66$7t`^fZ5cfJ{<>E`YI=fWnkJZM1H6Az^Ck*+>Z)89uZ%@*o)LGX&n`>C|We zj~NZ!=7$S&F!1;wrjv=v;}Am2;xS$yt^srAK^LnGEQ#5oXb%if)@`u-G8HO!-Cf6b)2}z0K`*|#Fn}O| zPJ<=SUexB+Xo-D2RS^GE{k*+5w=XX6K9`wY@l8lhJkV)=t!%kr#rqvYR;`&v(AH0+ ziJ>?X3#MyP6A2ja87fFg5H^wb?&ZCFIR#x#K2l6}JKBt2uHGuDv5mZvAGxZmL)Uy~ zL506HV{+5+lFZ?fn7LeB7kqqtHq}8qKqDfHG(+jT6dB{&llqR}HHl|$yp`_3<$LHU z?xL1?-+9oWVfT-p5tQ|k_^Mj4dHb^C^`xOg3hcg#TXCc-u zDJ^X~qC`HVLj7Qfk~~n#WDfkz*N{MS1UYsL7nJC8`eHaZ_sQn}1*YlQ8+1!Y(U&8) zI=~}(`H_e#(F_v(?Vric?iv*(MA>uonReVT^eUjPIa4brEWBjx+S%wXW)>Aal(odg z*>@!9WqHBWdKq&^jrT?@cwIW=+kwjXDOl-M7{9p-VSvYT@C9)?>0AUmjCXdJn9Pm+ z)(sx-sc7Pv4I8no&f~i>xmjge-<&O?^rWq#CS6f7Ot}L1C{q1nrXq8Y4_2c7}PwL!;>^a;Ve1GZLr!WR6M{7dca28#?;(+HVQjWOo5# zY~=jZ9e(x=XBCHh7`(AEeFS$vUU9m)XdWK>Y_mozmK$m@ikFlcf7*^T9<|u_=Hm_R zYs`xBW5aVq{8a9A{0&Gtd%=MvN>KPnvxlr*_F}wi_ILx-r{YRZKH>|I(5Hs090UgH zth~c@pf)0)c`!ynpD;2_fQp&S?gH@z>$IlYWi3Vk>O6SwB_YR-)63Gb82c8ai5V5Q z?WcRkZTXUb6hL8aGe9+G(X^h10T6mQIgbl7rWDKp*aX!~Ba{rnSA9I98&7JUQ54$V zPH3Fh&dN}|r@Xcj7Vk8p4=R?cT0YBT&OGno!RFyDz`-LASj51{$PJ{>GwjPj>n+y3 z>qMQ3;%5>GCsT6JQZ12@ktx0R;nga`uI<1RAWHODeC{U;Q*d5=4lak`0tCLT@_4{v zbT*2&9`FrQ{U*KBb#T@o4F*y6t6$eGpy&+h~Qg}k_cxUj$rqv>1!~1i7KBtLtIB9}I?D zgsGkwMoFGW)Abbl7|11|g;%#CIpZlvfG^G%`stIF_2b2SnL2to8FFd0W1BBKbsrO+ zY_>P=$*7oXw#mg3siHC`1+fTj>3uTeTL@3i!@=sM`=%kFjT7gyC+1_$MI)Gxt~m+w zp|fnj4=H%hS<#5$wkG<(P~KtHg8!MOtz8$v{-nO3W8_`R32u1>1<5mQ>h!uvO77D; zy%vq+-jLz-;Wj&VOvO=q3H9IxX=wwP1(6sm!3f(mvtGu$GS%NuMv7f_>?amEA%CqF+w11GAz6P6Vhz2K;kR$<|7a{8tIp&y-c1R7#& zEAFwe;?Og?y3gua)OiGW9ZHVDPz({s#HezWuo zr2N9dFN}w9H0Mdae4dT7P2t#GAcHkMXLWPV=_Wv|n-8tqsljd|N2q%zm&RcEsm7?k z12jW8D%zO&n{ClJpvWq0n!nz>RZxaK{{V~w>BHYYDm;JjqPcbv17$asJS)G+hh%7L z?{wS_4z^~hg5TYtto-^>mVCGzq;pM{XuQoEvdwyVaeFG1vt8ZX=G``lgUS^EW_6~PirI}YKtypOJZFP-t55y$l7SbPcdlp6!XS&6 z&fGl4L#8sSg@8i%tmGg?E$>M<5bM9`xQ>$2WV{8IW z-ThOfBk4s+;mid6#|aHD=iIsO*c>yV)P4tXUU`pBmc0~=Ta6z-o<~~zLVTI;@J)M- 
ze(p>0_ea+Y^xmxF+hrD2mwhK~-st1vw>5NNdqn?4trK+>#94UB~>3kACsP!2Vh4GXjz~s9jY~i*Y*d{-HD8B#10O=jt>3*jR3%0C$w590gMJ#}VCe&e{kup~F$O*|n)=UMeL(qXW!3$d-?jrYqYBX%ZiF1#qL&vFqU6zoetC#c-SSC>sF?*^UDDFl zgRr~=WUdjnK2q2@ky?PdkFebNI%?OJe>gw;fWp_}=mz{i+J~Inq zbAAw;ACb4U#g3Po`y%SF)T;5kz(&A zv>dBhw6(Qwf7x|T({`7N$}^xXW%u?j9z3zoUeWeemA!j9DGFb>iT0ym$;bY_Dh9=(q*cbx`|w{I`~lju|Lh)2}>vvJt(lU;y$5 zf4p86O2^gN-d5jCu5Z|KRe_sdJPk5JWt8~#QZr#>Ke_N8E9w&3thu9+ zRQm&tW8pSaZ_~+ktZB8GGG)r`?!TjPA-maOkDLUkGK8Ip zm45gjTUssJ@?Dv6FvXY?Y|V9(q2r;Z*pl~$5c+;=&md@c%Ow?ML*?jfs3$tN1sT}@ zQ3bok5F^(JwPil3Rl^^nj(YX#RW9C>Q|(nNz#h=%JC%)Xb)cZVO-U*NUeeZU??E>n z0Y$h{>Bg4ttsvIF49f7hszfoJOU}s@ss7jKTw(IVxpC+L-|S2Q;mrm+5baddcGD$u6+`A2A9=hmVa-DHvCXn|IC>)?L8|pD*=U%N^j|c ze8RnCzZs6DpRhdj)?)PdJQNx$RhQS$1n#hY^hBplpMGuYU}e=aA!lyBsI2TFoFYG= zU?@k*4I>vfgROpVKer`XVwt{Ki)EkVs7u1Do~5YWBc`&ztKj+LV`D-Yvy_fS?A(1W z>-Qn_+9s4cnWIQR zerY2h<83IGIQjUzXs-kUE#rNTaVT~`>(HUE#eP#5TP1?-yl+!qzTx`Sl~Tn?znFZS z*HYenv`DD_b3>zXLyeUFWhZDSY>R$V^=+~Y*W&!S`;STp|L$|d9=%@YCVw6Y2Zo3O zC;kdeo+Kc^JJWRogMvO|V-MWoEDL=nMpjF)FWSV5V&{Qg9he@2To)D@@q5RnM<2T+ zl@-5gWQWY%MZ2~yJkDy9bK*)H54%L}@7G`bMxgPX)k|?4oSwzaxvpw*Nma0OOf>l1 zO6_2el$4ZB+#4aB*x@*P)-}$ZVM(bHcch%^V!I^4qPsCr$KCLJydZ5gBt5TRAkX$`EQBTi6>4;f3RE(6}`*)^zzQ`z({d?ixwl-5`Vxhur-q-LJM}P4gj)0MufH0vIkz*Ld*e70?K8%6 zCFbyNCYpY{I%8J~&mA1`#NzO-GYt$rI+1}La97Z}8Gu0YiLydP%$)v^y)o%QgkOV! zq4epQ;SqV&8v1{y%!L+Cp!>1!WhRui)8(eNzT{`Qzo1FSkmHaU8<)b1u>kMYm)IXo9ynqi2DB7+ApM+8=t$hL0IMPT zpK}^;Be6xP4c1p*t3UE8``qkxlOJhaL7nupqL{_sE3V<|uE4F+g^GOpHpZRj;tSmB z#=e82Oy09-ff38JlYx2cCq=^i3RRBEd&V6-HhFoN%j9FPGD4yaSf*(O3Je8`P2Hg? zps>;;{1xZLd26MQ*sq(sV)iRnu~Oa}WpX!7*qb?AUK_4b5KiHf8qMIm1`k@^vu8`H zP3*L^_@Fp%Y}H^@iJCWiiMUPtjXvElmpRNb`K6*PJAA_2W}EBHe?BR(SG0 zOfblh+`%a7o~Lk0<9yw$ijOE={8aqfyW9npr_Pw+cDy4)RWy=oB>2ndp#yS3J--ar zPM*Q9C~)Le2j66oZNiG@4Syys-#XoRHRqtk1_V-vtBR-;I}P?cR}F06julV6llMC18YkRaJ8ax*E3IJ;QsB0)-o~caXD81(mVi!U08*FOzwJ+OUx?{>D$Nzj8sJ ziz_!{p7ngFo9#G{(Fi%}GhdE<%8gPkt@$rZ80FsHq0V9|$hePI4WEFna;MDf>z~TX zSGQuqj>uPzR@Upc>zle3m3w3X{?+#vAfuEC<2 zVq#)K)hx&$(o|U8Uds7Ztkt#?Eb5QWCtVS69sl0rr%szDX@40gQUqwcQ#Ll2+}v2P z65PNfp)T2-*a1Z8J3Mx?aJ0CCeFf_Y2;~ZDz)vt4+?6qlnEL{31aYI(NTH*~3+p2& zsz22E?!iQ;!*7i3!BAM44~{*%>od;myoUzAy$qB!V7ZAlHa0-_-4YWe08IwAx2!`Y z1jqAsoUi-B!8fyh)+wV2`DO0*{Ud%S6n;;&XlQB zr{?aI$eN_gJcaQWFfl)?tD8?n43dOPWDyFu6~-Hb6DK_XhqdMo9ZrjVCfX({7l=|t zMNF4ODhwGjaAqI7x~{=z1DvuE&Q~RzEIR?yZR|h|*mMV(Tz;2(nJU=-B~;e~!5{<%r1zg|V%R7qj1=BQNCw zehv5>7KyB0E}*KXM}D0lBpB4Jt#8(3MG59J#}<1#JE8=kT%zTL_XS1rNQ0@KAChF||CV;^dGGq)TvV(V1CjW{NS_W ztD7Oe<6=AqoMGKoQBkoCPJ~Ge0KO08gI93QLm^vUPGLw!d)DgK)Lv|Be&DQf(b6IczX@@__exWB#t;>RblMnZXEJh zXC9ev81OjzjLmrk|6$$eH`~)NBtAU{G4@J^2b?2ql@=Im$>2b{UsbnkXp~s*^cU3# zg#FmmBm}Rt>36*g(UCuVIU>|ZhizP!x&lHAsth5*p!JBqTvHd{4#`P`g?Eh zjrQWAqAvnrH2GZm1@K2wcJ8{lniVUy!ux4aQC!kJW%awWUrL3oYDlMvhl88uTft`s zL0dS8AwX3D?Lx}WpdFdXIGAA_Q*vDjxTE||kcMp3pF^6LIPFI(y%6hxZ! 
z`P;X3vup1~MUhC97%IH#SP^}-n}&2v(WxaR0zhG5B<@M7u!HHeum3CPeV6*K z0U+>6zy+7iIZx=jc-)oR)(X+K49-W3HJa5zmy`AspW6jB9ge(URVQinX2Y(z8 z<6(?Q0I#?t1M3g|I7<+P!ot9wU2PJoYl~ZhyIa_YU3P-c8LM*u zL-7|Pwhxo3mb+%DSddpcpK4`8`ile>9B)*FkZWd3#uZ<|X<~j}uWiKEtNc2* zWj%p>q|8M=xba#((}wJY^n+ku>Ek+PH;BE7W!N_UwZHBM%wl}%B&fXlXQ&y{h@Roy z`;H6+3dU#ka*N*ZBghN42ohPmx2Lg%JbwJR)zsnV_wZpWK=P>Xd)`|dJ{-1b$^}UT zNKFbGdugqxI0?*Dyq5jY{0NrV_`vzmuVVpw;lMb^`5Nzq( zWXMYZO&mspXutO(r{DT~yBQnK4R0X61!Or+U_PY*iB>7%CK((&JWka3@H8u1TD!q0$$HMpue_eGsb{nTWFB z*^3uS7#sD<_zCJdHpbxSdN44Xt}W!_6N`(Fcf$jv&CV3jh>SL$9N9NE@KG{yXm;2AD~=(*I2U#uOg1uJI>fd6gq zXSpif@oo1lTwunEqfAK&O_C@~OV4o}HH~kbaMrv$gLP;Z1SQb$%}}U0AD?OW$0y;Y zy>Qj~^>cB2d;*%Tq(3*}VZw&4i6Jc;*C$HXF4}T#IX;{lxJT#F!FwgCP`MVq!~p56 z^3OOw_d_?uhSz~}(IWGd75fhDQyUnvD=zOxEmTl_9$S?cP8(0S-Mpbk@rHVDD&q(c z#W6OA;R?7d5QAoZV#RL*(B!*#06Qh>c7=oB^MYM7!<>(9vG3ghjjjjI?)H>5 z{>BcFO+x$W20is%dY^ZwsBl5B%Z;*{2a@IpNPP3VwK)7DKV?j&sdZ4@d7-=fS{Wo( zF6_!d*=0DN8XNhsJqTmS1V0FA2~_@y(4*ow1K*mSh5N)%gi5OK*BDg7tPDxa!NnoB zuV$Z4wM)xLNwIjj@N?(u;o1iedZ4`VLWKnOOlvAO>%nXN*z21!oE;rAaV~Q+1wEn1 z+En_L%TG+`Oirj~gc*vv2`)g{-)OJx!12n-<|a=xY^x-8gy$dg+<~;OaD~*5xS5iE zNMt>#^&EEN8F}F+dqLF4xp=Vz)W#wpWyhDImlw0H;qhq&@Mj+Q{`THH#rb|3AS(Uj zwc@d#J&>pT*irpO(0!LDAK%M4Z{8unm(T*u28G1k+}wNwH*j(tN>oio*&*rW%K*9W z!`e;Xgn0`<`0w;=6LtNeSO-*(mGJ|#!&kFHJr+tQm&m=z>9;xm^b80*JQ(~m8YFE+ znxZy}x*CwlOF>l#1Ily-F?SZ*eLiYBHxMz5FsnVN&nCD)bmgYE1(#SPHEp|k$n0vT z+@5nS&CAM#6{{*bpkaE34GJ0(Pl%B`;L-QM;-1c!JZ81mBTi5o21z$TbS8VAApUq1ba)Vp^%Ue36u5#(8tasQ0)w+Zi36ZbgO$G zWGRN1!4FnLw3>DDWG@hX-p`*a-cH?>vhPGb8YC3~W#+2iaPk7ebm1W-A*tzyOp?X? z1nGc-Zuh$53P+C$q7HEglH6iMsc(liv%k9aeS?q?E71Jw?I2UBV7qvo ze`4;-#m<+}{qSN2BW=4bamxEZ{sGsYnAKl%J>t@(OU-r(ibjV{>r4bw0T~9ujNl_& z-1;cXrUM>4;P(?ryEc26&&Lek2HwqQk1pzPl$ffBL%(g}icTHA`HzrrSMRma)zZ>B zwyb7=CBr3RGii-lVw9mGSSvaVNw9|@KqRk*CJwqwwML`P(2-BYH;>kV zfJXIrvppMXwtR4AKXBI#2~4gdsbBi;RP+vEs44A7(AhSE7K5Bn6y0N3a>OAV`RE$G zP;mjBZvPgLRg~uDwrzBefAj*8`z9_B_9K@m(+9$OO(19y^FK;gz{q_MDu9DY-_*)3LOHl))RSX-|}gaq3Z&_8C<*G*wt z*i2K01SL8A=4^Ay(dVe~Od#E~+5XtKbg6KcsDjII?6V!{)MdB;Mr=pE0^_Leaabj+ zYv*V56M_z1qP^1?hz1}&+rSN?(~M8c5Z&h~`gMDo$N$ybxklxju3`LN4qGB`C^lw@ z944o7$YDpkYAA6h+ZaC?z!8ON~fWszH+mjfgbu z-~HP2ZLhVz?X@kRtkm28|2)rqU)OzI*E6MzVg63XgnOs*Z`@EZf$ONWo1vQDwfLGX zonQ`Stj^<%V?rR=A^;i{WcPZ1vJB$6CY-`?zk9!x!O77l5at*wIY`}poWrZ_r`v)J zp9})C+B=a>@BYYMqi!sH?(UkTv+L=9mM&Vpzokh^(p2U}*<7>s=(7KU zB<{I6q-o|ZCe5bpHYc6eEo>3_?r)>4Crxg@umuy;J|Ol!T#A~*WZElj+~$Uv<2Cg6 z#xHgT-`W@cAxX39dAv$Z=XjfQXM^>MVSr!} z{c?zp5$=jcus0wbcaP9C*i5&Ak z1y_gl`p5LbItoLyt{8J4Nrv_{P9#H>NPJCtM@dPDoKz$q`ePEAsqr6w2N=Eib^M3teV8In9zuj_SG@2tM|j1+Fp3_VC7Da* z;h}~Q9dc`R&+(gsfx5z^AZ9ecgRlpcizrX&Kl;;(BqKYH@D>!rlB)n^TxTJ?=9Vi( z5wS0cog$LuX;~J(?_`t_e*2B#{>yyHZYhNb|JoAMG>el>KzcIya1kLk$Gg-e+gvn( zsUbmOqakOKpi3qjKRpk1)eL)Ub5x~v&iMQEc35W}5*(+;JD4H2=qpwY39eyQrSHWd zYsXSuzr#7wJ>PNPs;h8qw&W=>_U&kP&|Y2;6GdkZxySD9*n=H89E`y$*#!l&8yXvb zU*YKM+WFbYsHn1ZA|x$*%ghOyh7arYAi_V9(hZ~yWqBWQTg@4s3J zh0E;yzje9BUj@N@vgge&h_?Ek(cP5roj|=GPHXf!^%J}SsW;@+W#}SkL+J)@R+uoU ztDpgr-|gUn+z8bkMA9G-3%UZvX-3}7EHIH7fw)aZbenEyGlhC#OIND>4Lcn{qv&8} z#N*CXzk5yXtDqYY`H5a1aS&Eq{q(K(>sf7XPyuDWWP|G0k64gKWtLAwA3D+%lx5@4 zL#vpM879h&|411Wu4?4aks9W?X;ToBBs3DQkp)z;m0CA00@h&$gJlRoTU9{@R1UA5 z$HA%W^$?aTD?dNq#hNB*A1Sqq+H6BqAv~6-qmXZ757@4iS^RU%+6Rm|ghQf}5|uM; z1{SvA-Y-O64%qkf@&Zw`Vh$Qg4J}6@5*T*U9MAw=x=h?uP^q39H`;)$>noH${4#%D zIg*wGa-+>D6A*e8ZI@`N;bAD^pDYx*3Uum3PB$Gz{5hs}b=H$M zq!?ADTDsB5F+ZEk&313nWPT=kOfDB}Lj@H-SAISLN$>JF0#6n?o}4ZGgpgzq36U== z8UD?>IecKXu=Bxn-ykQ4;2MMyRjfY_r=1KT$V6i%5Rle&;hGFFuDvYliBkY^q75yu zp|SBUn(ilJ%YbbYvMRj^)q_k%#Zp3dE@Cbk5JaUb!yg2yx*}-dwuY@Z$DSl!DBaGx 
[... remainder of base85-encoded PNG data omitted ...]

literal 0
HcmV?d00001

diff --git a/example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py b/example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py
new file mode 100644
index 00000000000..95a13d5f65c
--- /dev/null
+++ b/example/ck_tile/50_sparse_attn/docs/plot_sparge_perf.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""Plot sparge perf charts from full_grid.csv.
+
+Re-run with different fixed (b, h, s, dtype, topk) by editing the constants below.
+No GPU / no srun / no rebuild — pure matplotlib from CSV.
+"""
+import os
+import sys
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+# ----------------------------------------------------------------------
+# Tunable constants — edit these to regenerate for a different point.
+# ---------------------------------------------------------------------- +CSV_PATH = "/home/AMD/ginolu12/gino_tmp/full_grid.csv" +OUT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Chart 1 — speedup vs topk for one fixed (b, h, s, dtype) +CHART1_B = 2 +CHART1_H = 32 +CHART1_S = 16384 +CHART1_DTYPE = "fp16" +CHART1_HEAD_DIM = 128 # for title only + +# Chart 2 — kernel breakdown across s for fixed (b, h, dtype, topk) +CHART2_B = 2 +CHART2_H = 32 +CHART2_DTYPE = "fp16" +CHART2_TOPK = 0.4 +CHART2_S_LIST = [2048, 4096, 8192, 16384] +CHART2_HEAD_DIM = 128 # for title only + +DPI = 140 + +# ---------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------- +def is_fail(note): + if not isinstance(note, str): + return False + return "FAIL" in note + +def is_high_spread(note): + if not isinstance(note, str): + return False + return "HIGH_SPREAD" in note + +def load_data(): + df = pd.read_csv(CSV_PATH) + return df + +# ---------------------------------------------------------------------- +# Chart 1 +# ---------------------------------------------------------------------- +def plot_chart1(df, out_path): + sel = df[ + (df["b"] == CHART1_B) + & (df["h"] == CHART1_H) + & (df["s"] == CHART1_S) + & (df["dtype"] == CHART1_DTYPE) + ].copy() + sel = sel.sort_values("topk").reset_index(drop=True) + + if sel.empty: + print(f"[chart1] WARNING: no rows for b={CHART1_B} h={CHART1_H} s={CHART1_S} dtype={CHART1_DTYPE}") + return [], 0 + + # Drop fully failed rows but keep partial-fail rows; we'll mask per-series. + # Convert numeric columns + for col in ["sparge_jenga", "sparge_vsa", "sparse_jenga", "sparse_vsa", "fmha_us"]: + sel[col] = pd.to_numeric(sel[col], errors="coerce") + + fmha = sel["fmha_us"] + + # Compute speedups; rows with FAIL on a given column will have NaN already. 
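+    # Each series below is dense-FMHA time divided by that variant's time at the
+    # same topk, so values above 1.0 beat the dense baseline; cells coerced to NaN
+    # above (failed runs) simply leave gaps in the plotted line.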
+ series = { + "sparge_vsa": fmha / sel["sparge_vsa"], + "sparge_jenga": fmha / sel["sparge_jenga"], + "sparse_vsa": fmha / sel["sparse_vsa"], + "sparse_jenga": fmha / sel["sparse_jenga"], + } + + style = { + "sparge_vsa": {"color": "#1f77b4", "marker": "o", "lw": 2.0}, + "sparge_jenga": {"color": "#ff7f0e", "marker": "s", "lw": 2.0}, + "sparse_vsa": {"color": "#2ca02c", "marker": "^", "lw": 1.5, "ls": "--"}, + "sparse_jenga": {"color": "#d62728", "marker": "v", "lw": 1.5, "ls": "--"}, + } + + fig, ax = plt.subplots(figsize=(8.5, 5.5), dpi=DPI) + + x = sel["topk"].to_numpy() + + # HIGH_SPREAD overlay first (under main markers) + hs_mask = sel["note"].apply(is_high_spread) + high_spread_cells = [] + if hs_mask.any(): + for _, row in sel[hs_mask].iterrows(): + high_spread_cells.append((row["topk"], row["max_spread_pct"])) + # gray ring underneath every series's data point at that x + for label, sp in series.items(): + xs_hs = x[hs_mask.to_numpy()] + ys_hs = sp[hs_mask.to_numpy()].to_numpy() + ax.scatter(xs_hs, ys_hs, s=180, facecolors="none", + edgecolors="gray", linewidths=1.5, zorder=2) + + for label, sp in series.items(): + st = style[label] + ax.plot(x, sp.to_numpy(), label=label, + color=st["color"], marker=st["marker"], + linewidth=st["lw"], linestyle=st.get("ls", "-"), + markersize=7, zorder=3) + + ax.axhline(1.0, color="black", linestyle=":", linewidth=1.2, label="fmha (baseline)", zorder=1) + + ax.set_xlabel("topk (kept fraction)") + ax.set_ylabel("speedup vs FMHA dense (×)") + ax.set_title( + f"Speedup vs FMHA " + f"(b={CHART1_B} h={CHART1_H} s={CHART1_S} d={CHART1_HEAD_DIM} {CHART1_DTYPE})" + ) + ax.grid(True, which="both", linestyle=":", alpha=0.6) + ax.set_xticks(np.arange(0.1, 0.71, 0.1)) + ax.legend(loc="best", framealpha=0.9) + + # Footnote about HIGH_SPREAD overlay + if high_spread_cells: + ax.text(0.01, -0.16, + "Gray rings: HIGH_SPREAD cells (high run-to-run variance)", + transform=ax.transAxes, fontsize=8, color="gray") + + fig.tight_layout() + fig.savefig(out_path, dpi=DPI, bbox_inches="tight") + plt.close(fig) + return high_spread_cells, os.path.getsize(out_path) + + +# ---------------------------------------------------------------------- +# Chart 2 +# ---------------------------------------------------------------------- +def plot_chart2(df, out_path): + sel = df[ + (df["b"] == CHART2_B) + & (df["h"] == CHART2_H) + & (df["dtype"] == CHART2_DTYPE) + & (np.isclose(df["topk"], CHART2_TOPK)) + & (df["s"].isin(CHART2_S_LIST)) + ].copy() + sel = sel.sort_values("s").reset_index(drop=True) + + if sel.empty: + print(f"[chart2] WARNING: no rows for b={CHART2_B} h={CHART2_H} dtype={CHART2_DTYPE} topk={CHART2_TOPK}") + return 0 + + for col in ["sparge_jenga_pre", "sparge_jenga_attn", + "sparge_vsa_pre", "sparge_vsa_attn", "fmha_us"]: + sel[col] = pd.to_numeric(sel[col], errors="coerce") + + s_vals = sel["s"].to_numpy() + n = len(s_vals) + idx = np.arange(n, dtype=float) + + width = 0.35 + offset = width / 2 + 0.02 + + fig, ax = plt.subplots(figsize=(9.0, 5.8), dpi=DPI) + + # Jenga bars (left of group) + jenga_pre = sel["sparge_jenga_pre"].to_numpy() + jenga_attn = sel["sparge_jenga_attn"].to_numpy() + vsa_pre = sel["sparge_vsa_pre"].to_numpy() + vsa_attn = sel["sparge_vsa_attn"].to_numpy() + fmha_vals = sel["fmha_us"].to_numpy() + + color_jenga_pre = "#fdbf6f" # light orange + color_jenga_attn = "#ff7f0e" # orange + color_vsa_pre = "#a6cee3" # light blue + color_vsa_attn = "#1f77b4" # blue + + bj_pre = ax.bar(idx - offset, jenga_pre, width, + color=color_jenga_pre, 
edgecolor="black", linewidth=0.6, + label="sparge_jenga _pre (BlockMap)") + bj_at = ax.bar(idx - offset, jenga_attn, width, bottom=jenga_pre, + color=color_jenga_attn, edgecolor="black", linewidth=0.6, + label="sparge_jenga _attn") + bv_pre = ax.bar(idx + offset, vsa_pre, width, + color=color_vsa_pre, edgecolor="black", linewidth=0.6, + label="sparge_vsa _pre (BlockMap)") + bv_at = ax.bar(idx + offset, vsa_attn, width, bottom=vsa_pre, + color=color_vsa_attn, edgecolor="black", linewidth=0.6, + label="sparge_vsa _attn") + + # Add total labels on top of each stack + totals_jenga = jenga_pre + jenga_attn + totals_vsa = vsa_pre + vsa_attn + for i in range(n): + ax.text(idx[i] - offset, totals_jenga[i], f"{totals_jenga[i]:.0f}", + ha="center", va="bottom", fontsize=8) + ax.text(idx[i] + offset, totals_vsa[i], f"{totals_vsa[i]:.0f}", + ha="center", va="bottom", fontsize=8) + + # FMHA reference: short horizontal dashed segment per group + seg_half = 0.40 + fmha_label_done = False + for i in range(n): + ax.hlines(fmha_vals[i], idx[i] - seg_half, idx[i] + seg_half, + colors="black", linestyles="dashed", linewidth=1.2, + label="fmha dense (reference)" if not fmha_label_done else None, + zorder=5) + ax.text(idx[i] + seg_half + 0.02, fmha_vals[i], + f"fmha {fmha_vals[i]:.0f}", fontsize=7, va="center", color="black") + fmha_label_done = True + + ax.set_xticks(idx) + ax.set_xticklabels([f"s={s}" for s in s_vals.astype(int)]) + ax.set_xlabel("sequence length (s)") + ax.set_ylabel("kernel time (µs)") + ax.set_title( + f"Sparge kernel time breakdown " + f"(b={CHART2_B} h={CHART2_H} d={CHART2_HEAD_DIM} {CHART2_DTYPE}, topk={CHART2_TOPK})" + ) + ax.grid(True, axis="y", linestyle=":", alpha=0.6) + ax.legend(loc="upper left", framealpha=0.9, fontsize=9) + + # log-y is too aggressive — leave linear; bars will just be tall. 
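+    # Reading the chart: _pre (block-map build) is stacked under _attn (sparse
+    # attention), so each bar's height is the total sparge kernel time, while the
+    # dashed black segment marks the dense FMHA time at the same sequence length.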
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=DPI, bbox_inches="tight")
+    plt.close(fig)
+    return os.path.getsize(out_path)
+
+
+# ----------------------------------------------------------------------
+# Main
+# ----------------------------------------------------------------------
+def main():
+    os.makedirs(OUT_DIR, exist_ok=True)
+    df = load_data()
+
+    chart1_path = os.path.join(OUT_DIR, "speedup_vs_sparsity.png")
+    chart2_path = os.path.join(OUT_DIR, "kernel_breakdown.png")
+
+    hs_cells, size1 = plot_chart1(df, chart1_path)
+    size2 = plot_chart2(df, chart2_path)
+
+    print(f"Wrote {chart1_path} ({size1} bytes)")
+    print(f"Wrote {chart2_path} ({size2} bytes)")
+
+    if hs_cells:
+        print("HIGH_SPREAD cells in chart-1 selection:")
+        for topk, pct in hs_cells:
+            print(f"  topk={topk} max_spread_pct={pct}")
+    else:
+        print("No HIGH_SPREAD cells in chart-1 selection.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/example/ck_tile/50_sparse_attn/docs/speedup_vs_sparsity.png b/example/ck_tile/50_sparse_attn/docs/speedup_vs_sparsity.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a2f053b0b46fab1620939cda92d4fc3efc067b3
GIT binary patch
literal 127494
[... base85-encoded PNG data omitted ...]
zbKI})t7$n==7TW<_X?wbs7tfeVOtW(VUrBWopIb%&s)@8t*15fZ5OMznfF|OS;)qh zoh+MrYhR02RM(z&`f_#ui3_n#rE}^*N{JAPP`%FR^Zf9N{g~cw@VnelX}ETe#pREP z6}DW+ddDr9c+gLyjTSN-xx;oyY81Sl_pL`6nORAL*6uG4Hr*t*D&J zd&c^a(3bQQ*Ed~1X(Hf>SoW1qcTS5wkSi3)Igo3pp;mzg=7A0MQMzsaso|kOY+VGyt5pV9wKb9~a4Dkj z!YR_LqZt3YAq6Joj=dIJrDi&hOkv0zeb7?xgO^-q`r_M1*8yJdwdsE|qQov`lhPuc zDwFgzIVQ<2=iuJ24^rDE2f1ZmM5~pHsyi*gCE4`(y?kkZM#s>XR}J?fdSXoXEZ~`b zdBJ5ri<@_tcX_~H46OO~{UNVL;N#h+Umd)5gER#I@J{I1dm!w{EbDu&wA^4%KzT)T zcE4M~d65Y2-QjJXKdx?4?o6Av^vHY>g(~sPWr3owOVdC~!8b z)zU^@ecsa3oGa5uH40n;jKty^z8QO~e82QG+p%$;6I4o5Ui4Z(avfb=(S$qs@A`^) zb>U{J7(C3-@(E&3;AqjpRQ|Aa{HSQXODa%I=|a$Fh;%hkl4p)_-9ev&?h(pL@01s|?Kn^g#7_HYP>idp4jsFl^1@g2Xir8W|Y+hH)yBe?-Q) z5un|*beawcP3yc*PY0+BZ{mH<%P`usLP5anO;mwYUAhtFY}H6D6PtWf+(sUDtl;zZ^Q~&#o?CR}(flFr#!ZzVdHaEhVNZJK#& zRA-v?dtPQK>-tovc|>O4?yA=COss!ZvW|x7$EbXf@DHxy$a&2H%iSOvD{5>C=2?%L z{&$zns0fu7N1_TbltUp6Z%f8_2dB8rS*wKIvDGI_cF2^(Tw>|JI=01LrP-u7Av9!m z(oRJ6@;l>XU9q!)vV&f6N$JOxlnUwNREFP2Oj4GOtoSs!rns~3Q^MapcOsQcvj#^Z zf-?~=oc4|{@XNjT4Sp`8J*U3Vun-< zU}_tA>Ekwu7y0LETdEEf_b=PwH`}&cmBFBJU+rbP_fHBxPA&Yz>)!hyG2r)9jI1Y)wS(c-@;7fd;*&@EtPhGXHU`dd-AFEX5?Wg(WFq^nQT?=D zpJtkXaQ_%(GpDFK;LisZ;?0rLTM~W$+Z=N0)x`9_jKHO-h}jvr)wFa8b|ON&f#b^zigp1B&KO}s@1+G&08QsuVRP? z$e_Bw^oQz~J=+%W=;}Aksjb{+CH?2)daTj6$M+t@5Zm|e{T4din3K8Q#^6=Pg}C`l z*UwD*zQis7MNkq|(Qf$p;%sz=z*XT_tS27wCng#T*L|Zt6fR78yCFTuiF@{;8dr1J z?=M4AMlKx_lUhKWl?0e$Oy|>RE)o%2vTmk7DR9uARmJLiLoG+blf?C1gRefZw=YZX z&}Z5g)&Z{Zar^4Db9zeKUT?e0ZW^nawLVfKS&d7;7w_Zto6CKrCI(dg)#*V2)cP-= zGl9@KrMe*?>}NL^wDek4adv^GcTTP7&T|tRt1i8++OvkoA}U3?q1rW`uKa3xSVlN! zHtO+%j~&%ly%QL2Ont<0Hz2Ys_`?gH_-~s5@vn~Hj%dDmnr)XHw{!f3e?4#K_vG%r zcrm&@R2s0p4jOC%ZXtKHT7Mrl_szRc*6?5a5|gXANk2U#I9va2fn|(d^3ZM8kNYkJ zyC}6ZMZ8|LqxyriY*7VoPj6pI;)#n7)Z}RaBszYNCA=cpeEtLwEu?mTz84Yk3X#Em zuqppPe0>E}Re8HN4k8KyDuU7i0wO7?qzTf}4IJjfCT9ikBm4nA<_zY7m{ zcz}k9rQ2gs$^`%FQxbodE$+OagdBV4==ss>UTjo9eL}PhNhF$@b!#gGAJQ2btEkS% zf5=sc2h?NWU+39QAlD-WoH^h*Jn>GHQU?}F#3liAF~Gmbu2 zl9J7M)iQI_4@Xp~CI&LeFg$j~f?b&oTP%I^xrLZp*%m^YEJG zY~<25WB)JSW+uK9B%4SwL=@=J5Idw9PZuQh!3SRtS-28J*Xs79O_Y02dYA0#)i|d#>nv*2(*}DOaM!7pUh5=E|z?uNR{io5C1g{W63=)3Fi%09I$Uu>pxCas|tkTWyUvM)Q zoYVt7w5%g#y*h;O5SWoT*Js==m8?3xEy2h7<+v-Q~)P!#qR z@M1F$jWJo;b1ui?4`_@BI3}x)2WRIx#51EUnB#}z)VN2ObqzQLir3lI+mtM(9^tw# z?j*Y|uVCoS7_QAsurY19{a=((*nYD;*mdiIsoD_I6UIiTy_pDemqYLbZ1k#ILT0^w z*OZbWFoxg;P-x0MN|CJb?7WV0IJ~0$zQD~nIaUG}4^sfag}Fz7`*9ZG3}(Ia^~36Q zJf?g>`1WmRCW!jsdnMor+tRN|CdXxS0AcR@cMLIUE>A8-LfI6~C3NLZd`l20`0F1K z2tEW@wq!2e)y!ddp%uC~Jv~nV+Qx^kOX8LIdc$2Slu9%I37!2&nfRRqVzu5igV6_? 
z*0xpa9SzOg9P_}d`%84?a15DFjJ>$nB7)JL>Ax39#UDOxTM z;!VRB-9d=Rgl5b{YJQPf6?!Jt0i_t6j+`8{a}l&3Jz}ey*6Tdk!)`WM|EOEQiuw#p z%G@G0|Jv##=2a=IMR7XA*;^i?S{`mo#0z^a0M0B6z-X|~yCjpa?+%}{S_JK{r`AW( zn~t`rV2+>J*XcI$P6+_t`CItK!LVxifelM>*J*nio@jS#By*cS+FE}YHB7j4Ygxxl zB2r=u3~#W9XZ0@1>$Qbkc_-44b9WG<$ns>A|T5idPI4$iU{0^T=|1Qg#%P+R=V zy@?vph-TNhpF7(LPRKjZ=L>rNFIGCq0Nv}rl5h93{W;X{q;i68^Li(6o@dCGI*%CBt00jDX{`(YsI^q_?_F!%$jh0nVNQHEmBhn$0-?%#hg z6A3wA4D_>2nU=syB8o>+=)sT;S6theoP2f1`I2|d(*a>dM&z$UaTEp^F5qQCS`|h^ zYik{dJfov=E;m~+rM?9HCCUNzV|b9{nTdiVbZznx%0WXTRuK;*X-eEMx`e4bP*bZZ z6aSB@Q>18nCU+>>rjRy5&dF}=RBJo|9S27z2PFxKn2JiIK0hc#J1eWoUe?yjLqo(f z_N(wK&mb9Ha4q17s@dUSBY+>}I>UZ^$2)@wYiam02r41oHkVW!gqG)PKavDK4j7-o zVF#rWB z{68X8wlsX`q)_0-jaOi{(FdXDw3^me*{m7gEh4%3Uy9FpwH25S_M$`!P45ivj-f0S z3Zos6g&gLVA7IszhLX_&R?*AvT-RuHc7*RPz+!Q{Lt{jsMef9@Th5b1I^@9`zr*`T ztyeD(8}`pQ*S=8{RflKJYV-X})UPP9v$9e@n|)dClQ|tM7dylA#VjvNP8L4G0+Svb zXb{<68S(c=g)yi@KfyaV`H>570hTgrYG-MH@P^S-MMK|_HbBx9=bc-eonxuEA}c)0 zJ1OJ%I;=)DHYvArckQb>zE7d>^Nt^8^z3?%kgc!0;&I5mx?L6NE~zCoGy z>vlfy~m$Noa+X=k>G+6958`SsSjJO~AFNkc9!J}t@>VunIZqgJ<*L%v&4mxlX zXii?{_a%~rjDwceWqTA$_rWSAO@cdqvjH5*en0!h5?h;hmY+?)Lz|&0rrkGpY-RRg zci2J2NpYiBbqX$_D|c@W-;h%gD&MTKAk58Hh^&^!>c+|i3U5B8-4eA~xmTV4*c%2O&(nar@uT0q$aw|iVKITqi?!53wcsA;R?G>u+`GfJA*Z39f_7aYv zFY_z4J0WGaTw0}B>Gjm=3=cgn39~7`bo?@UCG-ONLS4~S)O&dej9sHKQD%~Y)0J8K zyx6|K78->h8{P<|EDcq&B~YznWT+Ls=?u3auv=iU`*w@wAw^=q8|1vOIeYFFn0PbV#_#a?X>M zn>S2%tzN}dhs8VQd{+A{#fIbykN)K;J=~HETNJVS_mZ3sTn{>iBnuVZ>SAsCS$eBvm#b!#fmI6-ee!np?WgT|Vv_Z^^OI|K6K9 zYA~*q8Yos3pdf_=dXH}j_3xa*KukAaCz)DV(pu3?gXJQN{vVHOl#MWp1;`?2T<*i9 zQmGirLG@UYQ#pGd4}PbYi|cP@lCc-!v0jWf&(*mEk$ zS?GudDW3edrknI_k@bSrBM($eu42eIxV<-C{%rg|R+d*y3ZtWsOJgLqMWQ9bzJA=GO(aVwoJr0o1o+| zy3=KYxpy*ZRJF~+jR}8&Io94FKYOR>vMdP>Noc-Ti_rPtXrDZKQkB@zN#`)2@<&PK zqYL>W#l@ulVl4)>6_wXN&4sSBZjFqNJGk^L{`kZbqQY&b=u0Z`e{juQ5lfQbfrYdt z__u$gzPz%MsYXOroFZmR%hx;ALfxyaYvgt&q-spmMS81ZM+tCK`;%<`wq2UIT4;4H%I8?@BkU>6Q33CzJ5J|((}cc zF%R$27OAPGo6_)Z!1;OIa5it-DL~(CdF`s!$;@r^Sl8!49#w+xl;f})z5^I!K_}hP z2=DX%LBFc5a7P>;Zpgr9RtKF|B)6WOMj&*cV!sXyL_y)KoagJNu(xlKAp#iR#(a6v z1&FM?O*Bj&UTR1n%U1Dfz^J>3oy___dSVM>6Zi5lcn91nCNcmOlgzpmaplDM9pO2K zvr^~uw+4E6H;?~6*Mb8jT{+;DfC=j>e4Qb-UjJwN2V^}jUnxr;Z0^j7;Eq_|z$FfL za8NpVx5ij~EZ!}_re5TZE)O2$98Ci;M)+o|+scUpZ#|8CM%@^B=bOf2p58BM>jyK# zY!$gvco;z6s@k?iFKYcP=L2pP|8F!DQ|L3@|#@VZ%$XQymTJr3Avzkg1m;HxWBgR-nBNVo;YA9R?f zyPeg5Ocf$OYVA9WrT5k2`92#R6Zp`j6z zLtjPF0UJK)q_if{?5+*Bh&h%ae;L+Uxxp>&#x!+MBK(_JI7R{k6rgU~1BMkFt^5x~ zL?$6nF9%o67@R%fFbu*krY?+kCr*&1Wf{svSBf@Kc^av_Gw<2i(CYv#pHqoS-MW+k zrMin3rp{|pC5Nf9zWG*jJPXWgZK3sZ*uvFWJBsxhI0p>>I7TI6xZtDS*aP(`37_2! zh_9IrQ`Y(bGltt%k2373wqRWuoP@U@5lchHkdbB`9)g%TADEkLE@FFyR10TO2Ntk2 z)qxwL@r9V?nw524fc{)mWWiG^!)Kjb8B2A7Lwl;R_E=qleVQ8e6D8mJ++^&VlXE=( zh7y9RFyo2sJUrzD4wmu(Fb@XnU@njY&^+1-d&xkxr!YdAKx0}_nxY0atgfId&8|#( zX4l%dd&YwpsVwqI-58M5zeBiX>P%JZy~9hP&eE+DTLY< zJrJH$Ys31IReE__>R(_a4>b))$-NC(b}p6i{3|8iK+|)A?g}ahvW^U}6w_0FRB96h zl+slx>G@ol4}FSx1WHv_7R z)huc<(u4QizH0`eZ`z+Awl0_~(mn}MXZ`&K1zQMeh>USZZWaPcpfSn!IVQFoL?aZL z4IufwO>V&;;7!l2x`}3dM6GL)+LA06Bi^I3Uu0gwZTeCoj!!M(8R0knZdyj#2$JfA zf&fj)hUpJ`uBvNR1SVzaV~Jb#J2M>)!RsBz!9Sm*W8#^F3m$EBCcp5l23}K(Xy%-? 
z=0BlM)em#fI5h+6-sr_q?P!Tf5xSFg+>Y$ZAk!ofA7<5n{Dg#Fx`FTCwF1SO8)iIX z+EnYLA0~5{cX(7KbmQ%$_ZrCc9I>pVc(m*f!{f}ZEN>{wd>E%%pe28*`R`lc9JMC1Hqkd4atH-LkN`{8^_e$M_fz)iwIWvn>)qk z#HSCUt<-y|AKq&9@aR5Sm!SD9^({6+wD%pYhRcF6soN1znMqkuUk^xy(B~~5*eFqY zk_Q$AWD8$JnF>?Qorn}x&IVlTnVy}sfd*eF7P$}H8I1owwmTNLAcnI8csV|YrB?ud zoC(8`Wj`SPQe=vSbMQCQs)B@Xn>4@M zd1tCvRv=XxpgnvgT|^%aX3-iWE~Nri>WtkH-ZQ##G!eoF{jG4$USFLsMrKnj zi&o~y?m$K}s6dtz?B=}YStcSe>S?50ytB>2^c=UJLEdkk|gXE^;)jFo5b zfrN&7qaMdaz+54B68xu69h6|y0&a6Et9%7%S5n;PK3hLy$Ut!R!uA7>-<;etMYmD)!cISFZizfGIz*|GK|dd!3-GcHNZ7=WaeQtFM%*`g z^!tlWF1?;c6sILK5`~_!U(75iG!tuh|Kq2oW+9+wAm`Mp`JUxe#;00mvo0lh>_L_C zu~56K*^CET$4LPv@p|Fv&mb9jH%FTtru5FdF=fAM8bMx)q{fEuX%;iF(HD&xZqyDC z_P0{GqZQ3-X>sQm?i;m*7vuN)0i7U#2gkqNHJutZTy%$4+9N+TG*eHi+s><rlL< zWX}Wx_-w#pfPY9UM|gN#o6nx>Ihls;oV*wI;W&Nbor_z;uINgzCH%pjG0$$XUVhH& z`_jW%iiG;QoIp40b=<1kdswhmhB`?yFfbt0KB)aMAXmc>Mh(T)p0lFFmoGnj^5ku= z{#hOBzun1N?PTNdBvg_K;0NRj1Cpt4Zi#7umM*n{@zJBGb)7EHiS53-+U~CY`<^i* zkD@p@+E0cOzpU&JEBMRQ*^M_6`Q13YttpMtIXo21a4BH8$321)x=RNh@Zgypz&OG3 z-JiGo$#HUzwQXay&hM0plN+_YNq26A(A+Bd%K1{l$}FdCcqm;JaLSp9&htF+03)ey z73rM9FOHP_o^P}S@Y)6t#1+0=`?3>!+5XF>JTWjJbZW@e6sf*-q7-R9bA=!?J@;Jx zfMjj)*gNPu2@GegDOF<0l*oJiVR`X=Y!r&DIzA`Bmrjqv`dCOwpcB*5F@j^%+!fF+ zXpX1EfZUKv2}`~C7Bc(2Xq0foF?Cz4sFPaBOyLIO+`xsR=jEg$ij!;V1xEM4_6l4o z(swx9Orlm~rg}#^z{&{#LySZuPF$Jh;i=z^49t$U^I#*%)2H~D7Pm|Co_73r=1J1N z%!vQVqy&KNq+YAW1cZ4`7p1vPO@NvXKo7j~3D6KaSZtcKVn~ul%s;pcj^9Rpqy{(+ z_4eR)7aDh+AqajS6Vm439Bv*_r1NptJW{_!t8?BCa?5-{v7_F~&_XAx3Oc(2)_b4< z`_1plh;ZYn0=OzMHIdq@WB1=3Oq3t5j*Fj78x{WA_jAb42Md?oQ2(9nK34=~jI_72 zZO2d}_F?LAi&#^G{zMi#sy`n0^`YS&wSjQ|Im48+QhR1TzQOeaNRbRJ(I-y}MKIzyn;e&0~@vXkU2=}iyaa678 zF=rpU1rUf7f<`CI`t;DN*ROLAtB^w46++}|M>x?cKVv- z@wu6y%w%3$vzY}EO=HgF0D4;6pytxq+*$H?Z#dheK7Xv|i4q?bef6dcuHw~A`ri)? zP{VOTekG`ULMub0v5N&x=|(*FPuVgpuYN&9a9`SGhDf#XkRgC!yhD_E=p0a$5T!$^ zq^JJp6RD!_T3#FQx*%>?bTg;!+HBEcvKERuz>h{b!*BX3!_GVWi z(%fm{t;qZAD=&Y<5&O+*3vUOxs{cK9W0r2_JcrhS!klr|$Bc7>(l}p0ZoL~sX+97X zx99oEd{nNEKkW)YVZ>4^BASTxDZ-Plt}Y}u%wbfm&_vb4b8BD1H_H)zyDGN9+DewA zSd(;K0LXS!qa`3QQVEaczr_E-JCJKY$u1qw_L*p0d;YHlEliAcO>F{jUIiu0g+&R9ghVlfQ|QSU-A|6)Sa?)H zOY^4fec7n0gz~|}62bXnXLi`;04_4LPq0z-e~BwA%j*WO|NTE+{}K|pHb2knBoX5{ z@`6N?mmrT)#{VUj_xL7N5mOG%d~EC&{zY07!6Pnc=MzECBm<1p$Nc5Yn9RbhZwXu1WA}olm>coHJpsvcYwAqadx-0AN1r1a}cF zh0xyQ{^raU%)QMje2pWF;@wU1y0#$+4U?jlRlutqW=eOWrd9x33Wt@u)u+-zmy!IRjbFb2FWZN{kp+Oo}|#h z66c>nc53S9K0Y{E#a;F|0wp-Z6`Az}ZUho&-;C)u6xt79zM@HRjrX}Tc0ox0=AJV5 z!}Ri+<%QL;3Ga~&_T^uF>WHn17`xPIjV3AE zg#$8gFN05bs&p=X3X-9*$06MXX;(t9hChui6H7U@>Jd$us-BT~r@{lDPp>jfeenEN z!9P(GHs1H;*z1I=`HgK z5BMg%Ecb`wu1W}k5==FID7kFO{A%0y0V< zX`|50a-@t1$O*`OVbMkIA_0L4A4gnu6L3mPMRVo0)Vxru?Us5{oxbICNV1zH?1Rp2 zaZY=y9#Y2@dBk1j+4rJ|8*s73>1L!K9%dA^aSzEZ1k|=T|o%Q#b{4 zWNhqBA^sm9MJ@R zw|c(?94$1@z7hUo2|QQ&@I{2wAaa0iL;|kf1IORqR4`L;4i!Y#Ba7B_V+fI;X7W!^ zZec*sZD+~*HxM#&3d))4i?Dg3#?d=Ei=scY%a6E!Zv3H;KdZC8(DSf*yG(xwU4wnj zW|?aUWYH&{yz%NtSBW;fWc5xK;o(>WsM5>Yyyz2$UZ%^w0q@ z2(LCkgcgdj5Wov~~@fNxfYJB^j{A;EgRL*(N|#hf{F(u!fE>0)$|=8HAtce{1F4J?cgM7 z277_7YRV(-?M^^A26>1aobDWMQ?V1NYLt$+voGOVIi$yKD81iBS52+gC#V!*CL9tJ zewOJ2+_R~<#c+67ko>$%5veCcU+bjx^h2Att91Bq@3_eCsZ zTSXs0Qf2}eU=cVVHqLTs;E(#L;EkG5g4=F;>=>DtYWZfX9GiSG!}Cn!55qLrd`nLK z{t2Wy+oqNw*BB;tTYL3-sgG^%ltzzwWw$MUJS4c(TY4VoIP-` zbtS(mDbAB*fWFQFRfGC26ySu4dY)-6Jj4LS;b6@NtpKDl(FhAu!M725cEsSl0b#R+ zZ;F3(i$O0yoGI8sGu}c}?YF)p^(=~nTx6)68r@+2rS;$y@0#r-v#h8AC0^RrCY!4< zD-r9brTKYPECvamFv9Uko<(a%suIg{Tr2W>-2JcaumwxV9Afv01fF%7e`M!!5Oa#q zGL9T+dggPY|Qke`x!pg~%hsYS0TCeV<2UpB(lxYXm4JC_OK_Vzn+&}>Yi2KlF&PQ?xgnAtikP#i?y#K8vA@Q4c 
z@6-Te&MxKck`j}JoKryhZdD(?NQ}_VL)Bfn8=}UoayykwD@)m6<1*{My8uVBT$^T8 zDf6JpAG_N>i{S{oWqQIaa>(}rNU?Wf(I3CutklA>ruh4LF8t^O>MIZ@&Nn}T4QP>- z1DY2hPaw>ld57Xnnk+=^|@cw!~K^9RLcLanCLj((!L^!h!?D4>i!#&Vgxig9aeP_vX(U6(_ zP54?2O>)^!UfJEd*ih{8U4X@}MsUo_bSs+DNK#^|Q@5+p{|$}ao{h!}?*i6FgI-Gi z*)@IaTbf(*+32~Ng?bC>{vplsTC-19f}dsZ-R?_@1n6sIcX6ie-JnFT-abXyV>sdb zYxMns)5P+V0#GRUuBw8*6wXmttvaA^8>Fr(Bq{0aaEk#_dun|0V+Sc`1pe3c+_x7E z1+|I^hpWj=loCXL_{qfIuWOCNDB-r021MqzqSZx{Fy=9`RwiKzpziLJ&@)lZzYb0@ z-$+tq&ELpZb$Uj<5!6P@bkg)}mRrMQ^>MkVQ{oI}316v_vDAHvXqftL-2eSn#N)DaAmzhzYgkS4;L_J%=V;}Y) z(=h(t3Hq7YoimshgQe<2!|Vec6{&U{O&f4AEsIF`31jL)eGeL%xCb{i)Mh$I2I52q zg=^8Y9)>DiI^k=Z6?Q^GtGBeNj?0=%FD>Y5uT?$bvo8+<(>lY)IdOeEy7jdA6Fv=T;8hJoY2kf{V<*oXlKVwVb>2 zr}n{_c%z!a*p>aSrYcBTcpTs1V5)B%hkBa*bhLa+Fq}Y3+E0xFW7THu8MuC;d#)Fx zfVlI9_+Q>NBn!wzT?4pbFm@Y)42E3D?JB@1fN_8Wj0#kGDS$E-@rAVvpv>A0f7kr#!Y(=|{_WB%A&K5lvJ%scs zt@av%wj#M9XS~wgEzTE?;6=7923j_6S>C`r7Zw_pKV%GLRD*vRs3+b6Wo=#bqcMV6 zwH9QhP_WAh5H>_U+_IT|j7SN=)@tQY)M)(O{>Bo~&5m6wo0m2D*AhuGJh7TjT77QC zJYB|pYlVNNN;@Z^wgO#6rfb7-%d#5tZWCMe_~mh#Ra`bBz_EXB@qxRaf>G7kOvqh7 zGa~{2(d67r)1tS!(ruZWGjgCMRh}w;69@_fgQeJ-MxPLY;T0W9v%SQOdc8RX2!Xm5 zbr}ig_%mV(vL1o8{M5Gy(A4Hak3TdifSu{8TiQTv{Wgbf<5PBwFOImAQTD4-dJRnL zB?q&PwUrw+nV;b!_CjfUJUWcfRx9$>a(eE?7{Z;=P?TpA7!XtJhp8i`G{ttlBC6if zQt5*(Qw%EI41cUyUOhg~jQ7ic7idF8tIr&fZxft#Ri;3^vVR$kfCM!uOWc7Z#&2LQ zM0{khoTWPi(z#A9F8V_{Nw5B8#|=$(eL<(bd`ez3FK0|@)uSJym{bWtW0*VYMN>li zuQCW3xZrvOG~yypo+w-37g=n&7aX5Q?Yva}+21s~e$(ExJ6Zk-1Ipo{+G!a}=jc?j z&o=aH0WZT|H_83|iXe@0&<^mYW-S9&MmCl+;Um}`ewp>Ar@tMcK4=+AHI-=@B)w~x zOJFcx=6j*#@!IP(VIgbd>@)d%5^h(9i1%3nBr@6F*;Gs4HO&`{Kr9adCT3@c5U(9% zus3mJJ*Wq}#hy(4NrQ{zFLb(G5#}VWh-12z8=I^L{Ml zth@zlly8xRMrXG!DqN3W<(As9O*7(ECafv@~ zJ)3I)l4P7djnD>|a^qjA^nXc&^Xtf5Hg-{ueIjaPbOS8-Y7w$L%4}Ejueu283&tm4 zG1Y(DZex})kbeiIEm`1q3&gD4^^oH=DbZ9l6YMiW47cRvp*G*@(}_zf-jwI_cMVES z1-l+Ub?TZ3UxNS}4}v2Kio7!+f?q6vSCskv8=YLLX9%Dw5HRg!0j6DxI$!zOH$4;C z)(c9iM_-H`WA2sGRV&|f&dv3KQ+Xx%ai&CW6wj91uEqT;8gbbg^c*&U$Rn&BHP368 zVG@F(F>&sl98n+B|M8Y6d@E9GX`iO`5hqhx@OsqziMZ|bhX( z?Jw@|lVP+`3L330^8!7=oz?7MaN-_M&>L4o3bb+roqYM!yqZ;v3J5^4@yW*_MD^3B z&?=q~KR;|}$oLGvB!mhmn~a1pg0zZXX+)>+r{fXn)}PC)X+QW9fc#V~tkS|M|J&T% z)U3g0_wxdHVducMq7PB9NQ~!AD{Lg+ zXTQ4C{n{nL9YOOiBROF=InOc5 zQlONX9Z}2rH_)rafE5B`gCVNb8rJ>9aWeJY>}@a|>O+|n2uTmaFDdq;n(4})@_ZAf z`Z~txU){SySF%j>#7^Y#I$QOHqjjcm^Ra`Kr0sy>C4)2mje4_tRKcrV_ZEFomJSlh zDJvO0C&ZSPme8Us@#M(`kR^h(Fa&fn0B`{2uY`8Ax1VX{M}LAusGfCKNC6oDEEgG{ z^!CHhf%p{QW}bc>B=f%~&rDb`?=wK7YP_+*-%kJw;3j%30k(QVu=vAc7p8iBfGM1u zopT`^3_h4A-~+`mW@Y^OQ+#*J`bHU0r>;7?%HtnrpLL$YtwMND=Aq|ZDe{YrCx}90 zhD1uBY!@R&x?VM~*zQeoxx{IBM?OFj6UZqNyG1%6=M|APz>0WXSa^(M6jFZ9oWJ^n zg-jCTP8~UMf=08WXf-XTa2LnQ%;XR3O5sFH?Hu!9lyQ7zSiaP85ghYO2)V8 zIYM^mN$mn*rGSIval-i~sDGP6-qQndgW(yrNSV_0NQa$gDOZ84&bR94E zxR5d%&GSnSgUh3#VTHfiS9FaGB`NViqSxclu2(gtoHra)0B_LR;;ctUM+m40fjNZ% z=3`Y9pu#@*nEoVD^qS?^IuFy!j6_>2L;B(XCMIjz26(h|T%X_@f8Off`oG!`PYI3r<+FVXp$L#aD*i>2j z9B2XTM?J`FEhvR(U&<#bvjOWx0_P?WlW#(9`SFjOH)AiNXGAw8Pj6`lO#$zmSOmqs z4UG%~1HsMYw2c#D!{i!-Y{g6mi*|e~RRlF)@`^7DA~-Fx1~?TPoG zXDK8y@-gt`L&cVJLHsLU_SxF&;LU2~Ta;K1YM?lDpSQ&5)z)Y$DC)Y}XF_fvZ9%GV zPxS`2%OGg|W!A{7kIUWm`*ar+r?UXo{4=(bcwo0HVf@B5Mlp`G~iq8_>>> zN+D30goB?tcQ^)zZ6NaiMeA3YVh+clFtj(Iu^==kG`pg?G!#!BEF82lcHVGWe|4ay z?dmbgisz-d-Ou_6CGPSntv4oOzfS??9Ja>rg)K1c3j(1T2NWBC@WaSiZ5Xr*kc9CW z%Cg-64p%~V&D%A{@U{`tz4D}^qdRrss!>Yuz)!pO#ncz-GfFbvhZ#)S%ZGnly06pi zIKGgq8K~bNVE5{kDAfb>yjao<$|}{szw$?2-KC2cPkF_}kiqxuOrpIe)vQpr@7l)B z`xwizEZwN*VBNs_+C7RT|Co&1YJKjyb)Vlcp)%ud=9 zY5a^$#-B3^$0LUgeG+0qw|{B}-ympclImb~#;p)=}b_qf0_?e3~- 
z746z0IXO898Sf|yNSl(UqR;l7#mubl`uOx%)5;(dmg(9gd(#EaEh4@vS1|H|$R|3x zWOQ3mT_t&ogoL*G;VN-_y`0}TY}WVYpXzlKYeXdAYtqoBJawl7f?{|Whqx#RlA4F? zBeG^^r|UwCD3JtoUClZItxdRoN; zD%+&e`&z$$CQ`3BaM7Q>dWS6L%;o0URN166PppPiEG#VUq-T2kA^1m>HR`+)zGmYz zPsqG{b#TujJV~YLVmrU^9ZbFLllZ{(_xH}++Mk%EY+L#S@j#WfLQzo*$m#x6^*~ZO zh!7yCwzI%k{Xl?rkp1LrU9GqNBWXy2C*z6lf$_Xp(rPi7esuv+r zG}=0&j-#*FGK*h1FS20Bc^*(03Jb%^q&gpUQ<<$psi3tv_DyC&xOLA#wonnRkYJ{- zC$8FV@eQ7;-RWn?j;o&MEqP)Y_=Ut++SEuD428+JT^K{@et#%^L{fP zV7r<|G{j|O3_(b1x7Zs9V+|5)g#cENln?4>14zox&@AtrK-=2c+XLNiZXa{TBT%R3 zVS`jTN0le#bxofR+^-{|=?nj-UP_4%3>lqWxV?(-o9gT5nsz$7T2k-Vx1Ko8hC?n( zEavqt-^u*Ry_KJik+(KbrP}>g)|jv&>AIz_mE0*c*3>GzK5*%8;m?Q;70FncbV5dI z0!qDsJbeTKgyS&$fEl&;~OOK-!ovL+K-R3qpQ^kS34u6@`MJ9Ioy| z{Kz~?SX&t{D%Z7HzHr9GCEnkbPf90WZJ?cIxcU6-Cy zI0SNJT7wNer=wcitPC6C#M@lt-vJE$FX57YlrCSI`>33!FGA*Ey>76e(yWI*$o}XY z{s474aC5G^z=_^*T+t{EwqyhEEQS$uiq(Y~F_&;lj+|y%@oT_Tu47AhQF=6SO!TDM z-(l-WBBkr%VH4^qAz@>GWz5Sl_jEJeb2!9nLeA^gF(6!}z{q;0mj!~tMI6%V-;Yep z@HxbHG=p$Fqn}I~(_Mb>c711=L!4=K$GzRsgIiPKBmw5P!jbbG>_UH@?}x%|U5sA{ z*&JNo^`6S)&MMe{yZDc=7Ntf(JwWUA;8Upp?;ekEQKQ~7c!Xd;fKwe}WJLyc_%q}k z;cF;{5lb?u%EQj3uP40Dx=hbPpCK9jMD zY0tk%6vwZ7rdG=1Fly+9C0n=Na;{MTeZ9J4Di_P4Iy<`6+0Eg=kn799R}K&Mm1{fM zr3pw3wVM)zG%Ha7Kl-=34RupJ@y7M*KD44i3cI%<4!k+?mNyKt{tWR>FQAz`-(gt| zF5w|0rDg9O|7&GLjvdu0*e1^sRC6%Vf`$?{HuWMxXlA;4KJr+TpPr5m3noqHS|ylE z&Un{wXtF}R7Fg_TY&f3e^#18RJZ>U*3HSqWPU<}uE?UnhWkaR${RaekX+xBbFh!UyvBxw3q%@*wl)qCMR)B ztk%3-ue|Ni=TlnGpQsm`Ha`|-k6{7(AyGzGXhK3TToCZ|>*0`*qndf_2`)uo3|NLB znB2$LSG7o2S2xq3i;kWi{D#bUQJ|oQ3jq%&$m|+-cIx4E^^!HAP*Y2dA=l~XRD+-t zBg5oIbO&tDMCl(s1&{_}6=#Kocchz^@Fjt?ux@N@EDsG8z8M@Y@>WumXnWoKK#>C6 zkM=pPue0N%8m4lSvF8PCX~!&tq?EEc-gkUV2+8As;s?1%6VgtRZVHzS|2mixU7el5GMzk)6h>5i58zuWJk<6jVS)CIq(`Rh3u^d%2_5gj zco>zjWa_LVl6@Q-8rt#EbM8Mr%KvyKxsrn1So}IU)e6qAWY0ibHAt0J2TYx*OT({N zg_=6s*%i#3P<*jxVY}Y9{zs4R+m zwCWwclev|INHOj;R@NZYY;?0qZeV^C zw9hKSD|i2!4GJz|ftGVXsXA`vmJArV;L8Y4+R+Zt!Or~D2MjMcB01mn|sM+GJMxyH<+ zM0GOQZ11dlXvUxuCtyh>VNLKs31bQUlXfqyY7aOvm#BND>qr)?TdzKLs7n1#3RGUe zf`R}?5Z~Y}-MUpMwq1v$fzZXPKseuwrF7bouXuQIZSLuS z(c|-llslF<1X(Y9c6Ey92pb58^ve8!fT;ol^J3sz=gJbt`}-Nb{a!*U zMgk!G8*6@im0H0G2~@D*Cj%WqBq)^%Gw|xxRx-dh;Y0Qop_N<_-|d?oL*vxtsxFt| zZ`n`J$S0{6rY_!8^&=qUG3J={L@#eG4Thbut$u)>{R}S=SuCTuO__(_+%bS&L4fnL zcWyzKj!M1(9vIRK1MHUv!yv#@wJ3ihAVbLMb?FeR6}3>(r*9bTxgXdFb}VY+GEJvG zMbOKCXk27ak7vdnpIEBtE|QOGH~C|wl`h+!?myeCl_Zeu1r7_{jKDNKf<(rCShz}q zbl9Nw2pG)=5C~NS;&1@N?BG%p6PfaL-^_A=l?k#9S5IU*{-Y+_y zbSZjH&$u_BKm4#nAaQfs<-=PHna`txk*@r_U%&OFb{~;Hy07$yHs$eqNI%x$&@OGs z2~wtrUp{2UB*U}oC(-_wPpI6IHBP|!J(!80Ucs0sFL8$63@Y6__j&b*wi*TSIhYt)7kmCIlBI=>@m~Dq}9*H*Ez1jfP zJ%LQA20_^`6R0qCgSMQ~FIu&cTcz%X;4{%!chEc=t6qQj-Aj~``e#&1A=8&vAizg_ zgP@JI|Ax*~&ZdOQx%}vPyGmB*cd2~-rP)_4ZWSlGG@UdVS}}tT zH)ZTy#FLr|sQ-CPWOUAU780+rZt3w+?468`+9qwI$J*eaKta6%5dPPp;fach3apCt zAn1+~aJG5ve@+__D!j&d)sKrp`!zc)uDNI;Lg@!XkKRrR9R>JUR^^)Y#jg>2=+>L4 z$bx3qdTFdW0b&9l^>315V5}L4J-nyz9>QnMIBm7!%?)8csx9w= zj}gM)*IGI(AGb+hk0(`8St}=%b7lRC(EE^l)0`Dt*SY-!1~50lnSuwEWq``wwX$Ld z_bjlZk<-(9kg6gDF{&!iSrHhYCH~k}4CAfcMOX(rJs$u zg-k07Pm*4`@9n%{m8mV0@6z;)BMQ2?pc3rOZw?{oj=pgDvcd-gD`UMiG+D^ou&;>! 
zfQv2^3-gMKW?vvZt&8T~G4;Lwh~s4ijmu^`UlTI?>`A{OBd~TlyJ8&&9_QhG5F+6O z_kSdK4S~x@RAgHAxXtDdXB3Qnr;Iv3y{-#W&;^~vI`!F+XM!tggmrENEi3Jxm@#ZU zgd$FMu#O3&zTUZZH=oZ!L2<5Ak0XCgfqATPa;%T;PwB@G6SKxfIw0Mi_ATCO-1@yl z5Hug$=B#*>0+FD3;xX%gvLJ-W?nXx9kVxu}De0HtvGC^g>r*(CxNJASMq+zbVPL;$ zF6g>blSuGkQNes6?Q=o%%=4V2ozRw zclZ9wHwex;H+OgE$=3`{J0q9kOSZBhiZ&3=f;XUtdJ~@|*Dql!*8g_ke0lZDt?W~e zgy;o7e~UxE&d2z7=s9dW9X2mQd=lSVeoKnnX$>vd;p$|7;`}s|tXc=z zYmuD>I0S_rXB`b4@zQ^ptrf!8w?)j>1MEW@zeA@$(=ZI6PrZf0FDQzpc*h3=1vEaX ze>Av~t@8u(ZAg}s`3Xl%i^lQ3=|^qXxYg9I2LCsm1(adjIGdBzw1YVtGsgn9^-`yu zN{NRUph5z+s?C#Ws?RKgZO8tv@tsE&v{Lv@MY437rF7dDagHNFIW~#Vwp_5Ub z(@I^<_8H-GAw#G`CS71~()UvIYqt2#|@yWhN$GgXsXCVEmvZN(OJ>(r~Ff@n)9Z z{}wSWJuy)L?Ce>PT5#VNJc10bG74@+rh`r6+Pc8{P%U=*WJtHl>`Wn>*Hv?g0g{tn=!&O^}?QdaYtA^s_} zZT!$$1cpc3eTyhlJ^kQ2HFBDXhlk(`OL=p8ss)!j)i$6LXD&!JH4PUBuQJNtnw$M& z9D*gdB00(8OxM(_AI`(qK5vF(L&d>`WQ=_8WkCfOrO_Hjww>pr#k^m@G|xv6oIQE? z@HEubQ^N{~9HU?wWZ{F(g-d-cdOm@cwOMTb6^Y*bC%A$MH&n7y-z1w4eosCTIQRL* zkF-kc5vdW?{PuZso1OFrrB@SbhL=BN40^tfbyGbo{{r@+D*k;Ni_vyD3JF4RuD zO2%SUNh0qOA{%Ccg-=~Go|lXL<1m%m-6gD7R>8hS|1KVHeAN^^5S^ht*lf2Php<2& zpt3AqvbzJfx}Dii)VInQE5pUX+w&CZje1ZG@Dz=WL0;{r)K8)=Z3sE_hviFb4~&yKMYW&-GkPEg#SV{MF$4yS8T8g@R6QHdz%A8DA|(s5S*MCq+rwR%-~T8ie`g92UiBv92-{9ugl7xmJ_WzISR z?Lp)aPz8V@QVR!fP76p_bbB;hR%@^kiXIeb>3?<&a7D0cSDn9`sn)a6G2Wx5MSRFX zA$_PPxH5f)w(4ckj*{`6*$M&nn6SuPy|&Vd_#f3Hv9JMYB%~JewN8$`ypBn=a_|;< z^fK#)dQ9q>MzoNMtiYS)wehx$X$*hCasRvh;9gsX;!-e+6JEWlvVcSfUxPYN19vb=o}#`T(M z)QyivDyzmA|Btb^j;eCsy2nvblu#r@T9igXLQ+5h>5v8~C8fJd5kW*ix?8$Kx>Fhn z=`QK+4gA*D^S<}q_kREQjd6~LamL`B{p{xxbImo^ocvF(t13T2xa@hu4EUxRy5=Rt zS8{8e7Y`g4#8Dd3B&l>qSU|d%IX~_nA^{FNWNTy%!?{?;s{lmIuoj-(@O39gmstcl zhE#jooPdU)rIe`WE483#yO=;;(Yd#mEy1rq^KJu+xq8JYe3AB)bh7`{sq|6H`#{;y z3|^Xm@M|=o{Tnm2Qp*lA~I(Yf|T4w#5Wt$zZ@l!%C}@mGbIn$lj8aiU74a!pTGJ+#EOF z^u+v_me7Dq&M1+Yvr_hOnIUMYS|6^d!F!*Y9 z|Nr0AGN~>a>X8nQNywIc2WG6s>OT2DB>yG*1sPJ*UQVH0p+D(^wM3r?R@O^fzXyE4 zF*xZkeO#nTf)PXgQ_t(Thv%kx5NhcHhKyMS^-|Y%ZPL;*))gz|dLI2Bj|XI>9DfJx zD+F8N$~>({&TYrxK$aa+)eRKR6Ss@*D2O%$8;BkBUq^u3f0p6Aq2ed__4JB&xJtlk z?3*8Hy@mpzp+Pk|Xtx9Su3hM-w3wF4Uie-lW;Jz+GUiZyTgrE^zw2xFF)$)fZI+Ll z6!R@%XQOLWZ%Uv$zOsrfWDm}}?2I~Bm&||Lf*4Jz(}#Th zT36$GDk>@21he_DL~%$+OgiKh(Bam;@iDS1|4e>(&u)9jiTCQ60p7#wnpfq9a;bJk zmLAIF?fz9c+8>>IX1%hDD;Q{oIOz;zMQ+&ePU)cX$X?D42ehD35aaiTP3su#^K1A% zHxO(eh;GrBDE0fbx3y6~AT21pp8@VPlbq{>@9?dHbD2L74c-|IfTB*c(W zQ5g_3v_jAlgt1O*V^c@Dw=oci>+=R`0i`79hb|boaf))nvxRz1HM?3JB^M!WsI^-# zUM69s%AVY&zBoJe5gZ~|QtWXz))l7A2rl@-ZTtfc+?%@7W0mG0M(Xu>-EkS`#BePt z1Op)Fz~P(Wv|hy2I}3!@Shk!R4Kq#{FhCWmpOtQ9mGiHFYbaR7*}`FP@fdpKoUSKq zgoK1u+bzT>FbjnYc7{Rv6Z)t~%Va1}b(szwN2?{!ULULc_*6ueH+FkN>(Ubj`gEMVa60`X{9b%eZ4MyF*(Ar;|npjw7NP;?fD4-`ffN`a zv8fR?558Lr<`O$#(_cWt*GZMczfcGnc}uz9!@o_&BINc?=__2@R=CR$_YgO{hkCJm zM1GBf&_p$$4MygHgn$r3gOMS~^;9z}{|J~NluL>W5Ey6{3c%O~WM2>G{^5Uij{lmC z*~!3cf`GjtBQpos)F3svBeikuWv>PkT)clipWr3L7@{wdVxXlT%AkKTwYK#i($ z$g+0Y?WCUOU3q@1fdui-|u zodzY0cB?;Wm@CZYbbqA~+&Va5w2SGpiUb#Da(6^MD6<~v!0wxaDaPC{Od{7Y%9Z|= z8~Yb$niQh+L7V0-EG78NDI5Kue%IF4{u2eWwPgW9C1^g4U#Bs+`xIm2JU<~(;fIOG zcczATCOo!;%&&9|zCQ@GC{H_8qe*0}R4u`F_RH$+2Cbfhh3M_rWs+ItSvR!CU`Ly_ z=O^w64qtokz)<6dmbEJnxo9HGIvG|&NqzZSj=s-Do-f(3Z`810s`s!)4nCvTPKgO%Qzv^`W zlaVLT8lpbxAFSD0_v<_1aD`o@ue%obzW8bR?=H6#ec8O-;2B4OIt;vl+lE^aV9GgYJ)1fZeV3Fa3ea8AfZ7O*Baii!Ug{kfAFxu>pnyR;Bo?)^ zD_OWx2|m6+<_LnYm3})5*IzO>^k1>L$aK94q*O{J$6PiDXxg7iFEEhvll85wXiy>L zbYL&o_CJfLx>@DRX5DS-d2IjON+>1#wYUroQ_}R)V@pdR$w&b=0fCSHbmiG3v4V(5 z)*9C6_)0%ZUZUAWbwp4cj3F*XV~+2*M?3B>>3C{NGtGz&27oy)ni}5^@DyLKjbV7J~JLSR87ee&*M8D3>o^e&;c^ z0K=K!A@CLZ$9K7OWJW5o1V+lFzm>BY4bUPHxyMxuDB3~IL&EPCbGlL4gM^!piTsp| 
zHGGWH?bISl2v= zL3!>O(K%2D3GUk6)v}Oz!3BW&*8mphl|N>ytp*z(R+#;SVW+g+;~2rmzax zz=PRX!RMZL1hZRNXJ6O9<~%k|iJLr2jeh&sfW5+3h1;`Ls|C-}yo# z)By{4HZ$#|jx&JR#6(x|+%YppFo~sGtTtmbz95{?R{ceE*c6Y-Z$`1)#pI;o`N@;Z zVQ(-g1O}Y(yeYI9;$F%DsdMt_-JSHqez}+#r1%W zlk3vga%T>tOD(1*-n!{sO56=DagOC<9aZD&dNKRI zBrxJcIqQc+Y8dUlfZqXxDj*eLBR4THilk-)@F<4F|2+gfgEiXpwBHcyZDEx60WL#WZ-nn3$hqd*H-~z;>n~$*Kfy{wp^8?OdM`wp_UBH!x81D%7Wv2qlvVLk% z)!ZQ^?E;Dz;~6xSBDifcvFQIXmwALtHJLZGRK~b|eLA;N{Jf(aJ1K^nAJvUws}c2w zSUHQ^+Z&W0>s@{8ljle1@E3TrQCpfQXZaA=@8-c*hwI+ES2E{ma@{oaB1_or+yRG6 z0xs8fOS}t>E~u<_atv1R7|(f^;&=bnh3u(fIYdN?E@wb9J*;I~C1+hEkJmB+-Ge?8 zI)1C@&GFsxcG`IVxKAG?k-{cx(x&I z87-mLqm8YICC@Oz{Jg&Tg4V+9L%Dt36$Xj~tMghk$v97D=J%fEyC{#Nr6Rf( z78h|yNu@x23^S_G!A=}nh0r!WKnIDCjOyTYN9&fTLQO++ZG8D0_5YG8*+YHE^>I9D zajh70F|bDJH%Zb4-qg7J&}C~|+vdUVd#WzN@*|~Zu`*meOkP@d9s7T1=OLqOHa4U| zOaPsHy+y50J3;)7{{k2yJt9T@ns@IY( z8%?0fb0GD<+J~Xrp4N{``M+pfqbODA&WLyRN6hPvn^zEC+?eyNHe-*zaqHHfVN7mY z^x%x6lPAc|7orA?hLA=k(dIRT)sxg6wLZizfo(~4joz7iFM({dG zbmG+X5Y6;(JHwo+SpHj1%u}4vDTCeq6&vmq#9=K}_b;d~kYf1C#E85Y}{r=DFKNdl@ri+dS4?lMl#$~jXGqsX@=Pq(ehoRz8CO8ElSqBFX&oe8FK{z=Of;M1gwcKLn@*fq5 zAPNYroov9wYySsaXa}TOX=e)kAvC;) zRT?(*g+Fz5y92p~T%SUyj5POZ%SgC$t**v|?LbGi2thn!L}x0_kUq|z?bmR71kMLQ z2)ajGGvKZGZNQbxVfqqWn1V(|%f1;#ZJKd+lsTdDf$kQo*{f3VDq8k2 z!q~zxFQgwPYtL2Ba5u=^J~`X5N285Dw|j=d>$H~uViAa0S3}OM1FMJ}Sq9EPR6?R? zZKzOB0yu<^!NEkV+9VSnk1!X_Fcw;`G!wwdx0pTTvc1_A%~1rxeiZOEYKEw{H~S|Q z7^@C%RxvniI+JAliHlvZZ*5c>WInrDTjEkFQ!TX?J#CljzN^X;JyP`u&fCNyH9y>#$kwWzEaU}qXUFz$9xz>(&8M<6 zJEfep(p=l6&pPF@iv=w{6ih-EPgsRl7NWl^q+1;Hfg0%OI6Pj=Mv9W|p3k>eVxB2s zRW&LQ&1<}&Dqq7S^p_X+`)AMt*|30q9Avn0Ldz+0P&*T5KB@q-!9c1v+zGvfdL78x z2AYP$AG3AgvS&o-@NO((>D&?Mcf;p<0RlJ#K%6sQg+RZk)n&ZpsoF z!S7x}#SboV-=yQYt15N}`=%6B%n5_Du0^8FSS7yZO##Hl3U{V0~= z6i+1EWc9DN({<{5owJ1G4?G@4Elza*aZzL+H)E-|G+L9^BTBT-s9LD_hy(g~rkE`) z?4;qI(wA?L-~=&!4362u4f<*q6#R7_Rg}V$>@(I~NfezZ%S+L-^r~HjPT|7+t0!(> z*!td5suofjv%NaBU6+XZJ~Zjv^?PuXwWn&c2>ihfo;X_89}4=Q&hx6{#Qi7>@qEC- zNHrt+t!W|TvqV~DMX*5P^_`Xo@%t~-xj~sQeOMOM`P%axue%57h13PjZ_O}Q{9osI zAv!&jw*|JQKis-j7{+ck{v+vyFUs5Vg}mQRdp`qjl80qaUHA`oFr}rWeDo&UIh}jU zMuhZ>+dFAvrVN*SnEd48yt917#WJX~c5H0S@pq#4<-<_u;DXU0w@pRj)ttaZiPEty znCg;oEU)cU$WX8Xd-s$z<7uF~Tqzvi>wNOcn}(sYqqQ3EAjj3_Ibpn!gSzr%SJ46whUmHJ-5lm~y6GA*gH-f+F2E92m^;>cqhjcquR62#%0+j~-!-a@@Y3HnytozUyZ9YhymAqV#QP zsrO#F&J3x`SxjEQDmQ$%KcBJ9;ktyS*!AandQ@OF3k?COUwYJ@qPN3esi|EOm6D2` z4HRuam%sAVmbaMXrWuERgVcq`mm*=?D1$ghufjqW6tl9KYv2&{;Q%8qe8o7oS@8+? 
zm4$j$2}r2$0ckP$J5Nv4nuG6oek`DfIlh{}qJC}dtxTwGj& zUA3l+u<$h?wm4&@t2CL|qx~Q>;+>&AIe-Ze#z!?YG*|{0+~b>>dpSlinYn+;HW+1m z$>F3uP4#Qlz4WPoLqtT%vP3!U>HS$7OI+@oVn-C4GeH(YEh&^FftGYPMD9~D$|w_a z@fLl1ZZc63+T*$bg7p17A_e&BU~hoIK4XzV&!QKx*>z&~6wwxty0S#3I^7Mjc)*BJ zvt>@JPCjmC=IO3}v-N^Evwu%20tL8Dic@@gd^)Xi=YdA?<91~aPpWsr!^3r-gEcfX z$|g2s<7IH%(nHW0)xLKBs#MNj=;%=4=l4ro`)I^%g81G1Ap2-v4dEujMS5qcEH@f5 zXdvOT%W)}|K(}t)f)gn{B6mBY0CoC2Xx#rf<*Lh5oX zn8Dl6MR6#8f3cYB-)gz_X6nk>x*5Y$*9X}bT41rrTY$JUNW#`9&Vy zlR2}jOZwKW?N1je1G>w8o~sy+>JWvK=id5u^W(YVG0v3{uXmT0(582-<4 ziCnhiqKK3Lf>39r?OumEym>|`}>sf%N3zVQ}6Oa^$mBj(6)ExCMztSV>~Ek>AT zK2LdQzg`Y}H&lCTZ1Yl{!Ehs*(;%j@Y}U&R(el%Q5s8N7q|*_D?VI#!L@s9KpOxmB zX}F@|&DDRzZRRc#hu;ahn}fAHw|C^Q^Wl}!#U{`A{?R2q?h&r0NUzoft8F-gc78kv zz@&R<$VRm1boRADPf7|afbWDBk}+7MmyTG{TN+Jw&F9iO+}oQ+2$B`A)e3u$-m<)C zJ+$8URW6!o#v*yk_xMF}$e#lW&H!p!>kP0?d+WFeaEd&jwb74b^7_$KQ=Ud;7)`v3 z)qO@9!uK_OP4888am21mYEVl^ zkl~M7qq5f^?K(X74AVMCpR2HKGM^M0=u5eNU0pEoC1&N9D&3P{yyQ7}X3L=$$Bj|$ z7?<+MO=tF#A5||-9cRaevnL(fD=QPo@Euc4;61l9Pc8dgYTGUp^8pE7X8irhGTVb}Mp38)`(yVU>7EoP8gYhvPq58vIlylqSs`;$y$_j40M( z_>STHR@TeB--i8^FRw?-D_72cs#e@;@#~?*KX=lfTbGrWkw!@Uv!5T96Yd2_Pm>ks zRm-jP5~PxUzW)A-X4Wd`_FMDAyk~6kS6;DPLudNPRC7s211+`U{2;S*ddAkGzv$~% zVjEG&x~a3z$jC4pzBTSTBGH&blJ@v6b~PDC!}7PKh45v^V?DN!k+;9{L=0KpBNg&S zU$y)xS(x7{j$1uvvQcNASUo=$YcHpG0NkGo-{-b*| zwo2oay`Vy6v8n6E%Ej|tG?&Bb%|fGJ^l)#_cywa)W_)`|yz#@i?4M&)S6#dzFp?}B zxA4_C@N|-jiuiCk38lytysp0t1pg5rj?1pOwdYQt@Z+bR@?{nKmVIZ)!0jQg5oL1A z_~na=+eCcmW&zUgiQ~Lf&m{7}5G(+oJ?WPF>)Ml*EN|YNEvJl{gVC>RRHo1QisGGb zuJUa5R_ihRny5Tmrn}-`y`Ex2O=At-_15d(hx~VMh^Iavk6oLG%VoEvAgq4o_C4lT z9XSN^JLP)il+C{_Dkl*8dG1)_!~0IWP@tqHB)tAHClbXZ6T?|=-NQ@p_(WP%6o{k^ z^h!zYydH_@YQMeTdkF`p2NSKVt?@UWvZrIIN7zyrTm7<-J$P~VdkZVCLL2S>SulY( z;qn!)#)oPSo)8few{>?@)6jUQrc&1+7MRfX!PAV~AofYcLN`@ffN*l;LH$Fn#=HH~ zCF_sP1uZJSW1Txvk&NKHB|4%w+nQ;7aPkv#sXAmQG%tbW?x$*mmDTw<2fAI2*v`;7 zdDNQ8g&r=5JiD-$E8{8)Zi^T$FY^huu{D{WyOQy2mPbVQ^gK_yS*@`j>CSX9sD_yS z*!c9Uz5Cvq<=yxTM1Dii;PQ+`@W)$5k}m3F9|sB7p0FEqlK|xbW6#Kq2P(mcX#*}d z1#!hs`KT|=Gj2JV91f3ITA@u%>}xPOE#9vwo|Aa$j8Gk2UpFvH50bt2Uw4VL=pxRM z@)zf$8ShJFGhW0;ze;QHhYz8pg3QEp+t1Rn9MZ6tYFeO<5p7d1R6+2OfJe9dQ=GQq z#-$IXn;$eTh%3zZZ@pP4eU>;~&`LH3iOlDVgIOQiwa{2OoyF9?%1oA+(uO1oyx%&e zovH}62W?~d5bhG-!)x@RpTo4yqPpGBaVM&9W;x8MH!`zcHh**y5W634=NsI57&l)~ z{lsQuym1d_p0-jo!EBMZR9z5ygrWgEcUS|~bKb1BVhyWLK64IHYEHE-JQ^Rr$8xu%fdv>94z6FjxX;H>yCX03cL*EuFl)5Wj7T^t2; z?no!A-qrp0WdTxYupAw<5Sl4dt zD^Gw7cYc0aTe|98C3l9d%B-r+vWkPOZ!A}|%Xh{JpZ(Ws7PmBv5&B$487O&oJ#=fp zAkq5k4R>v_Fk|wWi#WGO)8!$dq0OLe5`mi>k`~fxYTQ2IUJC}{1YkfU#BqcZ4(b8X z*HbT3L>d+v5C=Rq<`W*aL#O8~HP4&e4_I{(YX)iCx>WsXo2XwuaadtO;0IJ`_~a?i%hKY-blb2T zmBHLXq08~k?EuTt;@3{wR28LW^{$2Ny>V7Oyp$KMu{vZQbe(n^FqkZU_fkmmksie8 zRoOOp_2bg>`G=}8{a7#{2H#GIDL9AI`qE1|xd!-VI5u{@AMoaqks*mUEYq992QLX{<`=4a|W{>9O3`N=cGv1ji^bCikzJ~GxKtoSaDd#p$F!V938om;+0)XjUS4s$%0UD3$w zx0?y#BeoF~-8FK%6F={-|cYfEBm_6Z(KlR3_@rRQe0p6obE`i;xWUZ`1VzHDg zZ%y`m?^%k|@K2R}otV<-8XCe2!hTXS$X((lN3QaM@xg-!y^{k2Y`ME$tOcVgHAQNT-tc^r{ebN@n8)*Kf`a=gRzzkq@>#WUt(JiM4-a zQ@HI?CxtVKBnF)`2YR&WUPfHo1wRx(-ZkYVpO`dadN!vXF2RKyKZ_}-W9vj`&RoPv zb1C`fC+W1tWgF1DV+#lsk9-$MOrXs04ElysQ*o3va4=9M z$zhrd1k169J>fF`!LBb;|09iKhm7%t2oi_a0 zDor4nL$~+gOGmb?tCqaxFr}y{!j9EO|j%H8nY9s8orJNCo|5@@X=O&H| z<|}*x{M49|GSn?Y4GxT=*_T-%>4Qcw(li2}J`sdp3&xFZT*ok5I=EWeuWvlJUQw$0 z#8~kqdGuIIvIu$evhB{;mD&`}#0YpumDr-FB+n$meWzH)|M!E|+&VD&j_4Pset$Lk zp4pu^nmobv-TwIK~S|1V-@dLWWoJ%p*pTMOPY=cZj z3a?V2(n!a=-khw?24BJP#;&qWzAvl4egen}v!D8?dAp&1T(VWrJ#j9v7W=ym-F7s` z8}(YU1-86^xQuiKdKW%&vr-Y3L3eU^Qu<#Jh&d2|aF#CEd!$q0* za?{gM67uscK8xJozTsqjAc?pGQw+w~D@sk8ZD)nHq(6FiaM&lbwMXTziglCR;yZeM 
z?{u9(>sb8dxJoQp7y=<>!noN|Qs zPa{oEJgQ4nRQ4J5Z&Lr8dn>&UpQjD^NI&7O{Vw|AKQFq@y|R!Ub$&zcVM*<^_3z!kY&Ow9LZth_;d6e}%75SU84!j&yocDNq`c?vZ++-7 zN?F?H=+*s(JHdmtCwtLlu06R;-6ElMyVQ}DnLzWCwpVJI7Y|&LOM}WhcRjKZS3CkX5o%H zk|Guy<<-zonH;}Cc59LJ&tt8G;LWm|+=D~37~GdeBPewYAfXOz8ynP+ z9Fkv4+x6pC?!%dy^^|13CFD>3p^hj6rFJ`W{QS?8X}qYl9!_Y_cYa&3V;EdlibK@J zKT-_3-LZq9RKlYy(6zfST&=$*V?~twk~SiqE&f<+_jr^`E4g_Thw|a2D*20>b^)=6 zD^qjTCxbfAjU;9A|zUd^0!i?c%LH9Zzzag^(~xJ`KcysFKu?& zsjHKvRND8ZMr!HLzE7_bTOJ@T%z-~l4huL+UwCkCaB=Z6NCoFHD$^&XPbvWn|G9Bl z`sd_6OWgMQ`8H;SLiBz_YO32*$lBW4bnIszRbJz1M`B`Pn)w)=7d*yyM%+(uEt0+0w(m6I%5!_rb!#0v`9tmIK&!=wcNlO=maJnyExJ zVo6wI3%vYk^hPFcxHzxk16k`9p(|qm(>?L$T+G~0B$g;oy6Sg>BcsiPGQM;^jID24 zo-O0PFv{(&C8})go@}W`);X2?o7z@UD7{&G55=@_Y}6~OW9MR9RikR~{EVE!N!tI5 z50O(35(R!^-44xdXjM?zIf@qaAa{JJEV?pq!cSh(;@$1$#%NSE2t1hB?A>i4jwmr{ zmVG6@R$=?xwOXH5ivNhhk~o)2e?c5CYWeJqJ-6l07X(m@K9`rrH7kv7C3lZ8&Mo%> zYkSWS*%zw=54*+g9;hQ%5i%@b!Rit+8G$NX-Kw1#P#AnkcqxSCrp+W(eDyztah)@{ zhc!mJ&!ePC%%<{N)ndpd>5Uh1ST0uVZ^%Zy<2r&9W06us#6)cP8K%({Kwd@A)F6xZ z4tT|<4Og8@gZFdNHOGCYu2sZJqt=^&yQ-1ZL2!MSJzf<^Vw06;bNbs<2#WMZo)|Ev zaT$`Yd72*t3f;fLKXBs0r;vGL=ks>OO!ZXVc-B2VMn0_zZ}SwEDsgxo*(UShX9zDW zhssJ>;j8ZN)<xq^fuw;wT=2l?m@l4Idbl5{B#oW$EYKf6ia^^{(`P!x=gC&yS^- z8Uut_UV$`%Il2o4{BQ!JqPU+M+}Zl&3V(fSdfL4zo}@7U^ILn|PIgM*ODvVZ>|6a3 zW0z~4$Htajj>v8J5VtT5;-wG<>^fCzRtn60s4j)bmKYF}OBrc#=nkHtp`o%$O2Tj6 zP=Jl#16I~xm`@!rdcqeUP*TDU#1!<1QmU#*C{CK=VB48hEK`FQ2aT=^_xH~!);Ox% zG=b*1SED-`&!ZR*6uuY1QTxD;y6L+AoD?;h8o$|VAVp9*#r!w$qk~k!!iD6%maEuib_u4$C+Wz1gm7w*>OYs0s2+$3bo!m#&#aj1J>DdX5899$fsXndfy4@#SaGm=8?w$;c2cAM4=CPP*ioOdt!6?lcYGW-f_|#@z3L! z%ewWk{@F~f28*v@kfEum9)R^fOt*o9&N`n0JwCX<7{Jj7I-vOXxBVrvM(QUsZyqSv z<)>A|R!E;_+*)~&qLBZX!5{zXIZK7LHEyplb$ZB?PG&BtDzytn9+?eYnK?2NO|+qT zt;QZEgyYNF9DFsd5o_eOR{2_mvVfZ7ME4H>(g>XGm}4Zkd{Y{E_qoIV>HTkpVtki9(zSpZs! 
zoEV4@q1guPkF19QH``NErFrR2KDF>t`(i{MpR3Slt;=*45Iq7b|gfK^S*R| zYZfH@`zj5 z-eplc!db5lJLWOt6YIetozaua%vp>r`EO-=YoemfS`_64$Da&g*E-@>G&Il=Ep)!F z=m{6NPOtXpedzB4Mf=`4PFp)A@}IA2FZw@w=!PxO$Oe)5$`RIF@Zd_p>igc4M`{%> zYdk-@=8NoNEkt(o47XeN^TJTB&%gK8|3yBrL$!xwP_H0r*tCus2!3>Tla-d~o(n&@ z^>hp6ruUV1AON#|?sh+<`m)|w;!NG|?$C%KCN@2$OAC9nSFDuc6HY!ec#Q7(-nm`t z$XcMR?%)14CMi%X6+<(AT1Dqpr88^vLHzHgjf1$8I!&A;HYD&?-4wS041Rn4#v(4C ze+juB>=|fDe@ji%|Ka8o8fUzdHzU&FW}}Zr`_D4FDpXZA(A!%Fqn-;-+0l3!pQQ~to)+ID%z9IDcY$43{zk`aHr;rE zJQ@I}v+SnSMs1F?A%R-$l!W|@0Rb|r6<73Z;s@+z4*bu<;>ofO^8~`ABVFetD&F*(B|fUZ%75Lc$a(GVK;T)_-)38+BRtz4(2L<(7bipDB3ooM zNDrnV>FMc6y|d8udU+Y%W{5;_A6*K7jrRRJ{_D%(pTb223dYu-g3F7_!}HK=99-OT z@VgR|lq4y<5VbTc9dm>>$Q8qH&=kHn3b?rU=#lBsB~Hng1&vH>ifbt0jz=}FlDL8v ztpz#&y*@98DsM$fQk*_1(nkW3*Xl+3=)wf5oyEs}i#rwxsaZpL8L^F6)Ga$VU)#yy zbvP*}Ro-LrnZG+Rk7f7Su!OznAA&~Y5&3rSsCU`DIgdIw3e=%02Q22OyT9gh-`I8= z?ac19*h=N~bTpFlV%Cu^O;B_-Q@v@FjAo$fzk+PDQp(M;%r>WhZ*dS%o$&Z112x7& zj7WK&q$H-4l+VUgD#DK)fGfSW6W&AWj(@)idwj`8d!#&g#P8kfTxd0xUkgyO1IUG8YqXXxm_dFt9U2f>9`hrSm3 zMu7w+ZoOnY1J%r*?FlCg;r48wD5%fkU^MvyIN4f5J3aOn7f5=7M7t?%ZEc?(&j)kH zo-rta0cQc|JU~AAcpk!>!;uq-MG%X)va)h{9q23Lyy`Hx99Q?y=69brid6GS5Z`cKN04Ozi8q zMrpbId@DMk=qe$kn_51M{E@r*GzmkJw`z;}-@wjw+dB_{X z{@|IBxOGxxGFcKyFOQFMY zDmB}OFb>nxIdVhy>E|eWyGyvGj<}Y}8XxJ|?dZMee6UxzO)n)H$|$0l>MtSh!7Mo| zX|tyt&KSnAzviemq$!Z(lceC5*;QN;HF@BeuO z7W^3K)vQk~5B~bK1DG-hN}Wu(C;%&1<)qMh_HKNxrlyv#3BDU|O-wMK<^tmkNNcOC zygV{9GqY~a-~T2(J^j|P3o_CVe!8qm3i1|c`eZ9HO!rK~0gzBxNxbFEsi2^M>Usno z)2SI5bZ;@Q6!AtyMWyHF-du49PT_>D5x5iN@25y8Wrf1;!&QkQofZr~NmfrX0>Fcx zxVcZl-{oayCY6_S{eD8u>DAIAEFHsn+hP3IFX0z2UQAm;cMl&Qzih(oiar!-FJ4@= ztGrONgr|TXd3kwJ9I7WNT6_EYv^H#Z=EYW4S3k7$E{$9{cN%)H^LuR#ljy^U37Hk9 ze$j>6+kjR<_$_GHyth_RQo6SCw!U^iS5Q!J+7kZwS_eHH98t*9y%x|~tKa}NA^eZu z5B^-U<3av_!(2Kh1-}kT8TfU0cE2|kUw#b@d4GA6es|yLj6oEOAo)Y{$7e@xm3M`Y9irOjQbXZ-7hB4Tp#O&A+EFg6o;Nk1lk7 z+?As92tDkz8N`i2bgo(~Y~FLn65tXzmNs~@c8m)`%h}Qw`Rf-md?)CkdLox~I4ACs z9=Wq{3XANz%c&}UthOI4tGT=W);#U>Bf*s=OA=o34jFgh=5}A&i=BxWmt#Gl%^0|{ zuy@>9RK9+dp-^ed?|5oDv50(j=E*JVQ^Z&#KJLSyJW;LNc*($d z1Xc&U9Gni|f+Lyy7J_;H38Md;e+IXv?k=@6BRRb=TI~`c{CR==^fw|qoCy)`eTQy3 zTNiE9jy7c%!5L8{lvNBaKUX} zeZwGgaR(Ofvly>2^_x2GT6{{vZuWN<%O=dss`r(E#jS-N&j;nOs6=Q%pn7QD=aPr2 zH4O4Sz?QND^mmtW$DT2z@PRusi1I+d?fDUdK)ZD819f#<192!i2ID&_t#8RSME1*B z-|XEzwNV8vKIV|}xnXv-1_Y+DPsdu{csPFjI9&R~LBv-eL-&{9t-fkK3=O85>%8Yn z_hY21%xnGD*Gn@_&d!=lM?KAtS?;#jxKj5WQa24V42yY*B)&HUCebC@sP5cZnp3&-eCGw-jq#rSn-f6AMFEup(J;q_AYmeW}pHkXz(2|%F_ z`x9%qd*QvVa$_IIcuqrzt(?PgUKEHbmAhf0~Ie@SIQR#RC{PVLMR}F_|;C(2Uf-EgYy{4BiftkCB z0k@bnYbt?OlJlz4c=d9xnASN?hSr7COMV$;0;6NoVqK4d%Br=zWnDP#owG){#$PvZ zc-a$|i^*QH-qJf3qil_t6wjaeqjY(>ec74#aCWC$dPK{KNep>G-RBDKyldG8v@HP0 zKVge=ejGbTa{)4$WZa8+FWmCQ!uGga&}oRu3RE6`u;*J1dnZDaAn3Y2BG`DvN4J7K*+BZ0Kitc6u| zOzg4o;iB1Ys8hv;0s%$dQe*5twTZdZtX@OtNJ?OuczhCc#vH|VfB1EUs!9ckiFAkl zqfhe*DT}#1+xP6vGE$P(MiU?NI6ehrnpkrX9IZ)z=@!!53&g*wxSYEhpG0VH< zDD5g~LyxDmK@h#Dj!ASjA7G8Ca=4y+P%?HU0G|kpIXsubtzREHdV;rS`0U`9`$O1^ zf~yBf*Onh8XRR0eov5T-zN6oIsLxJiscClq+WMaFV4~6|V*i@86y+?f1)jowXv8A2QT|gq;2jAVxd8Wy!M?=~QEIdv*}I zz>Qn}%64GO-0EK3{s+7C|5nmn?$A6r6RBN_7rEfl)`}GukrlYDrGZckGhmE=_fuS- z`~N-!Xn2|UkK1IB3=C|!ldlbZ5E}Lu;~@SnX4WsQ%RRdE&R5$tuddQ6)nGgjw3xAa z<94VasaB%xxLJ*jYw<39w})nbY^o*y^*P>`mn{82U=HfPffbG9c9kc5+BV4cw`E`9 zOX$-SCY654rF?KF)M~}G_ZmxBb_$=HHO!OycIbfOSDZ(j>zCt~l?|gR?cakz=QSIf zG4Yme(*je29j2zy1MfILOk(X3Vs7`*(7H|L{_v2-yrM zTX=GIMLw75mX2L|eD}V63TfM(PD@it&(fhjJdL>4`|h85RGz*RWtA!T{Uza#O-DXw zrBc7un#>yexv(JBRz~ehh#THdk|*SL!U84m~@7Cupl&=B>O|r@<3}iIE27Z~V*<|gED`_&$3<2qz3 z&_?3&(4>C@{kHm-(dBhqib#k5IAoyfN*xVb;(kiEJdSGTk21W=qi94P?rarv4RzS+ 
z*WaJb1uRb%hQ^TMt8 zR3xoR*q_A!zc2|7l4g=|z&U1(E?+jO!dfz_Ief0v!Kiq2VtDkS|HAI`}staGU3lG0S!*j;)Q?bOl ztlrZcbJN5MCmEqF{~S7BL88nn)TY^K(ccGJUyev5r!Zyzi zo;JIhFCUg=%%>)lXt^I5&)ROk4_yhuYD?QEq3plKrTkB?9qKmdiu_T9fWRG#lT^^c z6ccZc&?7HaPS7H+S-klL0fk9DgG+|Iyw6x!AT+;zE-p&{{7uP98a&bEO~X9`EHv;` z{-qJEHZGP?fqK;pKj9!0ih0iZMrRDEFLT`FN!#GOTwC*dD7#C8XQSZAGr-9jxSIgk zQCXrbOBODvr*6k{F|kf|33-ik(;jJz*LUp2nZG?|GSp1SHq-2T+LzRqT0H;qugLu< z@Go`Mwtf&DXT*kksIPLLkcov+>B~*&VTAOqXRIU)u955) z5QTa)bKS`Py>HCTz(bbfv_@ql_{P@BVYiN%#Ekr-V>wflJKR3Na>TyiY?B9>O{}FNl*hit1 zmj9q3rh9*Ghb;nIaEmG-CgN62n!xq&oZ&G}4KgnQ-M%r1+LEL|)0Mp$Wtqlq~d z5YBarD@;7TeVM~H(s$z3*rSt*w|32b+u5My$SIG?isyg_a`;5YVp7+0*Sv8js_%=V zbvhqLpIHi6z7*5YbRi>J*gH?*wQ;JJ}^j>>F7=V zj{kx8#x7EpK19ek>BDFke>Ebc)hP7UoixV9&D9IONWs6#Z7cL|@BrUHRwkN?)jy0q z4onD}+fwEG*31yySzrXpq@sT=bP>+{=ughc@k1JDK-2N5po1Wm@L!>B*dsOJSFc7r z14LM=EgW4)GB@@A-ZsbIN{GXUI9BH5zaVj4!F~Qn08yRdYXpo)vEXS*xc`Yde+>2I z{@3Gt93o=@*Yr3QZIx$kCtd_BrN(yfGo2c|9NS&!*)?2Ed#||i8TYeKpXenO~=&8ohW~FhDsV+V{?kVQqBe3ueBO?TD^HEpy z>VT1w9_P(!?;eigy9o;P-Vw!EyvSKF*Tvz!*6TEIMPQI?SK z4kqLRA$|y=2!Q?o2}OuFm7kwqNLZMhrY4yUGTNPE!Uk7(Ztvn5Q>25W9S3>bZ%hYv zQgddRm3_17d*Sh>DvvXUBIx0@+zhyq%zorDNvpv!)294eHoU4s#uZOb=GcDif~_9+^I(5} z|7U8Jmdwz0BMvRs29n;q=9n5@5mqr3C8Y%biQB{stPdz3^y}Ld=2bkH9%`v9hu9j7wumrikbNqw6c9s_NRc0cj~|Nl_Z4ySp0%Bt^PYx+JAL zrE7yogM>6lcQ;a!N_Ty8`#kS^&iV0;v47|f$8gOx*PQo#U!~D4KSl**Vjz0xwX7Zq zfq5O1ikeouwwRp8KpIKwU$N382Dm73plC|`C+Z>e4j)(kU!?pyE7f?5NaXLD57?>? zAAMBF7a#83o4&J_>jaD8G<`{&-QOaAypn!lEL*1O3~ab-9M(FYDJF%k0~!<*UuppT zDsTmC1TB+kScZuLU=+*&D1F6Xv`DkYd1?FZ{@mR0IsomAXc|7!;f5k8XGQJs(t+p& z&#-)Yuedci>fUW=%wzj1CK7QF4RJi3EaUN0rLa?4S$c|Be8xl0L4W`AapC{6U^zKC zVW8?Vxn7DmG>IP5RHMTS;*TjU>)eQ#?BWhu`lOw|QCP3a@7GDS-43Oa5+(kF6a)mc zKLF89C`gAUIC?5yzvT23WC(N_Y->2aH8dRE08v1LTX64yjVB1(a3vX4U`u!gEh4wV zS~%~V5BnW6_U>{{JhHV7aJmodmV5>_ zHe4LeODM=12acKspo?a&ar`4~f72Dr_PA{WQ)k$vypRDR36Nu^_f zcjM+#Yy73=*SHFUW}Ydz^h=Rvls~3T@YDzd|8!gDt&GpadrbSgZrj_ATOC=IlO^>^qvdF7tXWlPg zULi|d)&SK^V4+c8>!HCTQ`pVJ#5_(B&Yj7jGF;IbD0qjWKS$~Co*GF1e|Emp!d0`% zsx`8~1*(f?%%6s@&bL14@e$NNKoQ1e{FlmV)A3dlzj_riuJSoa@`EoK9f5wgdBt}O zZ=o+0E%;hnv{A!Ixv%?Z>=(HV+>a$O$c50_0b}Lrz+9NWeEVhE{QvOEp10V!27K(C zB1D~x{FseU!rDFb;*lCAVRDL@>m20iRU~i=ol2>+rIZ@{x;^w0E>^6VP*2pxrR2fu zCdg3@PTrM+wytb{LAki~c(BLNM361`SL~5ju8w!2<>_|_(yK>4izXyYJV0H?;>dCd={sHXy5IsR9HYVSz zb)&2~E&g8{VC6r>OsK59?=W#w?$c)5IC}yQS&4+y623oYmu7MF(6@NAkn`ndp@6x+@ApV%?D3@N?;#IZ5YNQM%pBF`-{%VnMKc3ky5Vk#Y&!%V)vRe{Y+jHsG_nemo34X~0lt}0Ml|1(W>Jg2X5JP{f`bgWtre9=8!-5#DZ)%b1AcP-8sn>m|T*Jl} zj7B&FS&Pv#lAQ3(3p4O6oIUh4T(uBUl(zJ@h7>c1U2II;H$48>cB1FH;1LhG!uZWcWTaZBa8=OQs+5nXkAD5r3T# z1rQjr+KC^50ksV)D~WD{zkKTn9E3(f!Q)aFc;9mM({i;}yS89=g>w0kS#+O>sT2&zcb5R43;L}n#V!+@9 z&ekBCAc@UAxV3sj#gOM?4(ROCvxpvi)>l7XEN-uk2yQR4djw{|{0}1=XvPYmRT*Xc zq;!60-iL?b%iHni0k=yk=gEGH57`@B@P`1n0n|r;dj(DIZeDL^L#nH>{e#N?ux1|a zbRskek{NjY=8Ps69)ZdN04xGz4Jm_o4OtRw2H;_PB_-(uS}!Seh0A;S+Bf-iAnzyF zZS3@uiTkvPFj2;Jl1F4sb~CzjToZ;BO+Ont(AytJC?}QSrz=FWX0|P_=Pgh4x9*Mn z+?TRML{_>FS*MfIm-wUv2+#iL2*lbMO+2(+yV~2VLrl&xt6x*U#(v{BXnO(*YFRbe zb^!rSdd`;y%+L>7f@#9JBt_;}&&tvg7K%ImTm3j8Iw7KW_+8Xwubp0x?Yekx5U>6` z?j^@$M0m<#J;SI8#;K=XtG(uIQSMBpu3WcvjcMr#G*3ofy%x*_<;CgU^{8KuRUO<> zKKVWJ7G-LZycv@wB-E#n2AEHpKxotx%_u!lwVd`H8%k{I&Tjb5Q|35b7HKx`91#^O z!`v(C)pcx4z$1YV|D7c=_Gd5@xt`n;uA((n&B-3N-2fe<2-<+T{1gph(o_;3!e}%49wzLPjm7tg^=zR@+JMt+M0)k=A$!W)Q|Num1!#!`l?b3PW4${uJL{oH|L)w1V~DV=5RzGtWZ5`GQuZ3 zeKIE^O?q}A=Z~2{WxJTF_>J{*U86N@+fR+>ibq2ZpzVSC8wqTDAr3KApd-0p&%SF`Z-6K?$)E*cXvASl%yQ5e5L+ zgkMpM;GBzgiN?wNsBdzk^KF!?a-CBK8&h;%ix-^K3Lqwt+|Ja~F+ROee&)|4Oy1QQ zyhFg$Ug<#5)aa~g^=fbobmmXFQbPB*7`C27b~ES>XB^bBK0D;ducJqWE1mfP@10q_ 
zj!v62spUcxj^Tv_J&|0KoxXNuuW0~{+OFHi67luJ^(v;-ZO5%}&Wawnh8j1G9fsA` zT$}R6zmvWMzmcJsLG%4OUDFHb-6_HIzqmTs{kSYT;zu9R*x*-hehGnVoMdV%oT2<( ze>~Mu1mjT?+;2fn9U|9W43CAcPrW`3|2~Y68;r{@Rq^2nj~9vxnO7G2Xf&Ruewid= zhJ{g|@`bDIqq||Ux@1IfItD{4iaV0kSaE=6**;EIHBO$2WbBTKQ##LzN3)^B9^+QT z)7HK^gjvM%?>2*`9NGa|wNoW7xHPy(@DL!9KwW=2_C*T6+mQd5Xw~;{QPj4r@rs+x zOBQrjFG>ELB=PaxmDU4msLVWw1Q4}4(mlr$I^03)Kr#wr`1M2`)pT=G>7~c@^%98U ze85i6E`$%0??jreDEzrM4xcGbyia|2^6ee7V-o%BYaGdPXIP_76QNO~rmW#L53A%028|9*#sCK|MFVx3$4bZ;kBi3565qiaGo zzOMN!$ch-x`RBFH*&_=f2V<+@r&`Y8G{4szfIE$in4(LPOE&c3^s}}rGScLNaq~Wo zp!~iJ*jF9CS1i$8(nq|cqMg|O_di7s{?d5!MLyFDk==I&6XG+8w6#B~XU4N7Ysq4F zfQM9rXtHt(CO7VOqy+(ZBpt)rFIl~!#Qno}gnGBLyoAY(4GUvTX=1N{1wc{)Y?zOX zkURtN>TbY&pA} z!@px*Ufk?8Y;o;TJ{Rz68Tt4G32r#W(3cl@8KDbp``h2$THG9xPu?HYKuC)CUIm_! zKXUiHDOI}9pnMjU!4TT}``od*OUl&#t-eR8@J30~aQNUp|2c}%N2$VhBQWD7prbI- zL90o~8~*~S-`TuqDmf@eqc71-u8IVpx|K>>#laow;dGnZz^SQ}d9`g9mrEC8^^3R+ zluhq{+!;)N#lxpQd}It$08mknq8>hD%VZlm2YkgGtlj=tueNTe4OvDDhu#c!JHXnZ zP75I+A+f&VNlk@5hu^zx=lnPNj1EF$e)6~=&-N!!+zTiw7H@0>E_l201GkL`-v_v} zrvy37bfP7*n1V90-7mHW*2atPOzl7kuJ zR#k};&&W}#e~gsxFIaOuIz(!f*O~PVFQ!IZEXKeQ+WB(gcO*+&iB89aQW&yI)EYR9 z?>zz-o~=MBoH66jLX8uhFI`106mWk;K?2rdcPHm6{|W7f`?oOPVuaT=6}ivJi#78Z z=({W5wWEGOK%kvE;m?y18%e?mouw&0tUgbkJtq#+`leI)y=6nFO$`>jQaYKaT-xF6 zEsahcz4_D^IJd819EaeBHrMK1g27EdXD8Y}sxLyrJAp0AelnK^_Ls zDYK2_y~y>~E{k;RzluEtQ#fc339b(q9i2OS#3C;d`Ik?@G*nhRe_LIbi0KDkBC_23 zGLv+n_e@EY?};v0g0ZomO5^7>3mJ-NhsVV9zOVXu><)K+1_`8Rir(283^QmI82h!m zhl-sHLlTOKK)!SQQ$REPRGaQBd4cZC^9+nut{O+5dClKUk|j`j$2xCy=c>{7DTY&X z)1}LA7a}b92#Jc!>$n=K$vRI~3uj{d)-G-H~A)@pi7Xge;gIm_s z!73oyx<@F<1;?L^oT#J9W9_wQn3q4@om|_==^F0<`Zdwt!n()oT(~yKR$fI1&nNOG z`=a})2UCMZZF%l#UZy?ky9V|*Q0dINS`aIsE&y?m1a<&;@h0oXSNADXI;7Tiw|p@I zC&>Jso-(oGW0Xx{G}QU5%>R-vcq36;!@#hNgK zQiJ>Zc`!(Od^eG!jeD^IHH&dDkS=0`d+K>>hYB~={Bnkek1JGClJTnzfz42((}^lQ z-jH16d(HQ_DMO0rwNEK5*mw~WXV1;~(=zjq-1mRb

    #DDTYRI%il^81;bV2p!qUG zR`3o`y$0{{KCB(PF72P5fD=n<`7u|S{Js7kzC={2LH3k<%`ZS5t!iHmSy5B|-s01@ zVfH+5V`;tGVHN2Tp{|fn?WQK{ef5*JQMQqn{VeQJPE;NLSsS#N?`bBTWAe%vvV_1c zh%F6Z*FZdBP>}*8H&BPZEcbPk1wOp1AZ#j#7F<3|{O-&7MTFZJ=QsA>F%iOr+feD8 z?{%$E3UI~MC@iR{XE-fOpbeL!mH~?Yk-LNDo7?-(V4WPn$$#Q>ja>^%_^dXt`0|ECxLAe4>{7#Km2rX^I=jkY3zWGMI{iNdMSllE}YEnkzg=0=9 z0=0H(8*MqXiHf;rZzXBu1{-K}oyb%_eG<40e11r!SWsjO!ZNRE~RyvLV(=z zPg@S4XQbcOhj^Lvy7zhNh9&yne{dFAbEJ`g-@zATCT4lgJz6>Tn_x(-?=hW_oY%sr z`QL@OFyJF)GiJNqTdzQ@d%HX%;<4O51Oc2w@o4hYYf7xzIUS3tx+B#=X$qgq)n@XR zCchQiZ(Qor`jVjRAKFpipy%CIB=~#T0;3-}N4idsVVerXO_UUWcL*?6ArbV%^#=q) zbu~5YIqq`%;x{^1Hxw2O8$0~E4pTmIsH7&viWUMhUZl9b95i6C{Vk7A4T#0WA zemXvq74<%#xrj2iuaS*SCVD)MB;WY0ZzZ=1WEzhJyMm-LK)-aJx9GG!J+n4Fv)S-D zBOg`oj^};G`vC9E`1evwIQ+{{@O8j|Pd|@y9G3=PJww}jpcQJ!g)$lDPdNSi04M^! zP;km zNmqpmL8r+0?$yPEE4aYSF3ougF@{`-ywdCV=ukGkydl&oJ2BKmUP6xol_ww^$LF;E zNyZnjeV8Cwb6VqZNAB$qcRBx~)zh7m4egZ>BwoK28~Q;I;~@~g7Uyhgl0aBy3M99n z;YYLLo?r6KwTz8(caAYu*tW0dv8?Br*Ij<*v>Y%+2nJjgx$dGmXmqU5qsxA}MY60B zKT0idl;-G4stq|nrzB@aZxDFFKmpgHV3g);F+Mn$W=g$=ky4a*OFeo>63u|dy(!(9TD;}1-101*(}?jRkLV1KqE zyYjT{@!|F(VHUeN23<*uSzC`wUt2+2KGr)+6#UN%(YKPc(w95TwBBQHf3Jq%W+Hm< zMX9=|(c)Uv)Qp*#2md*{mTB0fK9G+7$@p5bA+W{R8~^DyOriFR4kVs|LGf-XEk*Q! zIqj*i4)QFjeDRSa_Jjsn^zC7#&F1-zM{Yq3ZZJEc_d@Sx(0L7pNESQaFrABXcWavf zut!6lg?zyp9y4)PuAMTXwQWNoSGw+{iy33-qlj>H!_AyWJIqtWsz@zW76-aFxp+CH z`Fh&iPoM|Cf1lV$RrrD^*`3NjH#mJ}^llK#^i z|9S1VZ@}=jLPbl9`^y7>*PxNn(2xN=hYkb~Qi@uqiHY3!C~8- z)n9+rBwyclOqg%VW?-pYAF}H*xw^F{;(GdOkoFp0FvkJ>M=c6E z94$_qJoV#OB9P8tioWj8?O6*{Wti4WccweX_uI{!7HlRktfn~gyCt~w?LZnP&(SRI zkL*V~cHi?N$3uFv^Lqpx~SO6#s+(3Zw0C1J~G;H>y zeg7QnH#_JwvnPtUe`ttgj}5y&h<);K1C@gM)Ro;w`GSG-eD#$9UEFS>JV;u z-+a?4-mPRq^eR*ZHGa6Nw9Z#Q&Xy@{QcEd8%c>UJ7Ga$HT3Sd6GAAJg{PM^j3ay4x z?%C5|1jC;pjok30&hyEGU=wDXEQQ@jxca)%Seg1M`S)9P{`wY2ltkqCRv9p}p+1K| z=8Uia2(7>+lMH09GXcMhE;6k*Z+?W5`^uP_n&#*Okn;X~R$WDfWfydaFM}tir3xCR zB5+f{%9z|tPE2$PKVB=7V6;6e7t4LOHIVdGuhB`P*!RH^G&shEyv|w4c!}1_k7sz zd)Ockh_-&XW(I7o0N;nZoOnJ(&Ez46co}Ng(8a+pR|dQ?{1tlJwI9mFqdVu*atL%% zq$O3J6L9Y}Dq)iFOKB+Na2%ceu>u~wFNI5XOLABAVwTyxWsN;u`1pltUKRgNt2S>q z47?qqdrGbYdtJJ1#DWrhMmR4qEml=B@`eWCo88 zW2|kL!PNI zD2YJ} zI2uBL{?jB|t%Q<}4&~^UEmffiOBso$#a_M(Lmv8FeT!Bn_mcCJgmh#n(f!EB&%D7B zb2teht|I4!6Tf>Mz+9Sh)$W@(em{#PrZ9+^6bL=)zf|%GGG1SQp3{R|Jrcu3$#qOe zu>)=GKV5v$%Hmbh9c zV{jvXdA*>pca0!p2t&t{r1pZxlJyp$IGGGdpX*Wr-EV4#Q?6~tWGaCzh+HEZ3orkv z-!nO6(eEvS@92dbK70p6I@=2_SN*mKcOUPcse!ggja$>YN*;f3#Zie-xsK|bTk)st zKIn$2Nh@Ai*L>L5gaX+juC83bs+J#!%z&`F+J03CoakiQ@A8?xL}RMmm!4#u)S}39SUG$$APv8EOsxz z$t^4_egYp}4lb^VrKQN;4U{wW5)G)wJqU0H?yP<3C}hIp{Ve8%e#Cw6#!E&gEx+ce zyOsHsx(hb6Fi%=mLewjE3OIn}*EUJs2rrGpr&*Iu-%XkJHd%g#Gx3l2V-$KcN zBFTAgLDd3aLqxVL=(4BY=V^RyG|XU|r%?)WYVgT8s07nduJ>)UW%P4MR>;D|8u`%{ z%TvPutqe#k##PHxgCc7tZOg{e6%$Bxjm=EJZ!x!H$$yaBJLK)>7W9}=oQa2|Ov5$p z-YSuq$)9+Cz#bnf>7hQ*WyA+M7b15h5L){h2*7|sP_Pd`_%CRq4oBsfqL`@`lcVz7 zeruIZ4}jP*9*3VWT6&HMpcvT(Zvh&m00Ux?4a4QJ`za|_jU)+y78K@g&`F+h4LWSn zNJvVO)YQcPX{ZW7H>dS(6re43-X1|O7#nq$!p;nk+xPrIW4$;xLXMTWf@~Kvxcj<{ zcxQK$AXCH9vHPpG=ukLb7a$VkQP+-dba;Hl-L6t=%Jx8n3GQgm*v=MD$rk?Rm`wpl zuW52nKL@DG+#;Edk^qTtLv>JI?IbhlOvxJ)8i}|}mTJuCuHV0hJ@MZ>cS>rh+M;rW>@ceR<5IkZ$mAlIo_|azZ!y_sTPjp&UcQg5;(B$lXxf=h=?SME)di%&FLgE*L~`sgMJs z2-*+0R)wB@U#XAG@$2SvR2akuQL}j;(ERSMVm?MfgxVn0du)tWPsg(?ehsB{0m%oE z_Br0W-HAd_#9)gjejHvD6HLgmch4%DwKE06_9-B@JM??}4D9}3Fi81=y4HXTR4h<^ zf%zY3b%1e<-%v%GgcCiqZ1-e3e+fWZaHzaYd-0XDMNHiibiIp?mEV+xt~bokS+Ear zvcg4XvqdQc$THr%@vSP+a#7oH8S=d;$x;7scbZ-Mus17eyEmoLHaDfKWUgHa^G0u} zA$0llhM2>5Uw4B~`j@?*{K3tK4YCLcHtem}W@M{g@nz(*#-#5~NXyi#EFHPPnL(=6 
zGofMZ=;=9lBzB*3*LHh3ziiCSL(Y7xnhsm&e&4CC^EDdRcvYE{?|jOKb^YAKvaSx} zd#8SV&1YaPv30P97SBPg~(A&k0ut;AFp3~ezO@s!LM6(RFm|XTSEVzit$wMU5i>FFH zVE4dj$j3Z^^t$!*rVy4U3g zJ0U`ZJYv?S$MKBzqAT=C(OOndt`@DaX=jHsqY+UoO>MZFG{15&^GPbR<>)*>?!{6DxP&;u$g;ro%M6TwGL{@EJt5 z95`X5Yf-9R3)O6jX9F9jWACj|O!$&GQ>t$J?7mtt;MXD457@6fOcXN5rA!;TUjOVs z^7uK4q~!5H>4QCsXV|-JzKY%sQWi%*1mMn?YmUgrl~NjJ!IYrb!yfsoWB@ET-eP{MK;s*;vh%i z)kd#dUQ|S{;+)SmVx1+)l{>Z%lMA>|Tzk#A`uXZ}ZkOsh40)x^uvo{L@T4R+*{Y&C$KL zkZ8*A)@T6R?S1x*=D^d$opT<`8lK}DYNa46_S+w2ug=gGl?ye%!k`}qqPk0*3n@iT zF%OL>Dg-3q9PMOD-}2J}%7-i@KVd_^ zH!wO%P5QWFm>j;Y1sFjoCkaY^p-3_jXDd-Hfu*45q)(z0$0pk9ds`Zx2IT9^|1g_C zC+f)>zcXb(wztb+HeS-v_U1SI9fNTR?E4zT#F=3#J6Wdm><47&w^ZQDa!5|1So)Kr ze$9w)b9)iz&~^Ti%lO+9+pol0qdP*87RJkj=DaY!>2BpH<9DOfN?y=ONvlOhdvs$g zj_%{Q=;*gRSLwPHt<%j=h(!#ZrSTlyhgayjP-nm^$mqnRf32x3mKXw{AV9BaH8{Te zS~DWX`(kkt08xW2ekAvVnRG6k%t4L!wm*LSFlk8${$bluUcE8XRAo$la+O$|82fGD zkDM#_0oidA=-z?g#rt&0-pNSwXy8leB1VFV%***$l1&t14w&R|(zKWSx+e)GIA z43*dXbJ_32+24afCt*L))Pa$YV#NxsJ4r>vA#(?@Ms$ktP%clE`_$+gY~P`?Yam{E z7BX#>`5zDgx*FzKAwkb1ASWt+50+n2EZljUuhINrWj?`anyxRZ1a-xQH(4UN%fp2h zpXI-H#Nkoci@ih8vVHw!PCL=jae`Qatpwnap80C@chAMEJn^Gnh&8qdExaD_C)bdF z@#KqE%d&a~2osU!=So8`X z_`mPEx~h-AdFGS}{8g?87uw+K1mE6J0*VGg(mMJb9eZwP1n`aci=&K=X8-RcpzGc^#lWlu1B4dW!pXe*nSi1BYwMU7r6%7h-!M^B^m58UX z>=v@C`0VP%-ygD><4OS!)Z=4z){x;9DM-HaoO1!8nEVNFxZKEWatjfpRw5wi3%L0) zU)&(p9~o^NS~r$(^A1NMWKvs_MucFW%K9z$H1+V& ziaaVT9EWOFju{2~AE`%9A1>mNaMTRE+|__5#q8{~-uq=aT)@3@B^smcDlw7r2Ms7r ztZJ>O-X0@XuS?_SlGxe5JO7?6TDpNyo$pC<#v{;`(GkjJx@{Swo_R*CEBqGb?3x?> zYRnMU@9uCG&NP@#$`8c=4D#e6nTiGJbRu*LN%pWsEi{PO-m{4#-h zcH!Z(sHq#bvM*eo`SWOLznwAVEEX;NMm7t~?>0G9=k>!33NaVl8h*1FdT&I)bXysR z>XS16h!wN5`9p`m5w@uz1Aw%hogJ7Lbh1hq@kgTn=8ZgO4{--B65voUB3hU%ff#j9 z(k1!|7p`eW59-VWvo6B=2+D8xWXB%%1zF(ibHN>#G-RSk}ekD*qq$b`?+oB`9>0!pl?$Zi|)yB;= zxItRN|HK;Mgb3Nek6Xq<)ZZL&AA@Lw*?KX(zU{;;&qTrV&5H$T6ms_i`X$ZBU)5h6 z_|G1O^CU{GK9WBE<|d69hA~F`lYY{^8#EcV-;Dygd$j8*Co-SJozvUwlu^6hyI-GS zDdaNaU;cm~L&ekml$@+&`4hp<2%wbO(%H!p1UrKZm{l+9E4FI(k62@Nj^lQYsxvlZ zRTY|LX#L)Dq?A&HgdZq-cgC!xXwrujGX|Hs8@v9v-7dMz@w4zMQgkLO73Nb`1#x*1 zlU`uJclkYwvywr#^lH7k_D`957pY?`Vv`QqO>%>yuAF5;XGHm(+Uo4h}_QZqiMc(3u|75&Vkmslcxq0`lNg(^li zN*$NmYis9OxSz)!f5)7H=z3K4N=q3z#>)nmm4H__Z`yOfrf&PCsB7g}_2aQ|%&A}Y z137DY>X=%ysHw;E@#oKs6#krh%pcBUinR7gm!x_aHbXx%l>vK9Dx?2KM=_y|80-z| zq%+3<(veNy=dQ(r6TmQiaoTVD=JL@_J=fQrwY%_^Jv(x@!5Yb@&+$8e=-123&>NLH z6}jUcR)tZm*jvu9J88yDt|e(d=n$dSdGiyEKmiq~&pd*r^Wfm1c!`FZs_I9nHRI^K zC^oSgD<#Q~)tS6|b^u5v=C<2xxm?NDpT%h3w!0?*USU0Zd*{J>jzDlQVMbPo?!IK_ z7_H&stpiT2Gae`l9o*My^Z@3@&<*FV6x89q)BB?NMK($f{q3-wHtrE*+Kb-@jdE=W~?Hg^B72?fDtFNulY6EdmB@Po^@7)}Q7fep00ma{1k7nvO*M~!Al3o58h#QP2j*WrFtlI#|?wn7GBRYT9|C_6BM`Q z4+K!D7<6FbJYAOPaoZg+Zo+c?CZ};vGg9gmbixH@j2EtLl_nfS{lmkOxhh!s>Nxr8 z>@*%G9i4wGLW(@~Y@_RT*V9VGEePPOd7_VJLJo)v-=OP;Z~J|v;dFJO-Xbatc~heq z#lln}j5DypWfBr?P)IW!D;JvoLb)V$)Amc$t1vR1DF0d03>Gtf z8UFgk$6J}V@-tO^1q=ue1E$`9|L4Wb4BNSLRpeHU{R#%S| zK=Q|w9-YD+q@V{|5tv&6YxQ}XqL!A-SJKfAz`&l>enkL4DLoU(oe2{JMVB+ovnnJXe;CQF6&%kX+Z%cWb2 z+xx*28zyENewz%EU|Em%iCmPYblnW}Qp#VAEE1fO)V4-x!%>HI;I3xk?T^c-Ry+4k zC$L1CoYrWZQo*S%=0^$zJ zV96~m|%`7rf&Qb>Qk|O$u=fD7^*)%;zi7|6wToClGHGwAo0GQIg`4|zV zqUC}JM4E#Cl*8q+TTOX&{OzO9C#}ynnzY1>Z0EwQ*p-@echAk(>*}62R=UaV8le^`9!L?xf+g-xXbxy_97chKm$cSmYl$@%!H$TlT z_?DDIE3#IsZW5uaFuxFaLKO*`&^}huxnh^>3k)c3eD#^?7%- zT54yz`XYJl97Xxb;4xz2wukfsOo7U?BhNi?g!!c7GS zklNvVwxdc!MC8DiDR6%_2?Z;qD`wR}bkP$@Ns_X|P5VQV?Yapik|*?KXjW8OuVuEP z-$>#@dJz}Ei9Fv};}<*v{0eb!aGa^LXDXpnh5XuXX-hf<_UO6U-{<2-y9|TQSL}I+ zDLdz8a`VftQgWj3Qrv8)x9;;2R=+lnrh4ZPCb^>~oJ$=+nF~)4L{iA^*Jj}&EI`G zXVWq$BsKiil$>fU>R%g1Hmb=K07aL!3qq0t6Vub 
ziG)m2?Omq5yVJ0@jVDtTgJ`1Q9AN*bRksj)$NAE1^k=z<74frX*01Dn_#o(7@#dFvP);ad%+@ARCb0-x3i(}K zeKgA6S9V(-wYg{eRWuq|{XQ7$eL$C$XSGe*YOXKvKUEiqKL+$GSP>(mHQ+`5a##A_ zO^m{ACZXV|_s;S;r2zU!kQLl)9|J9VH*udgkRPax?iszK3TQM?zP}C2=QCUnSD3KV zUNfYqn2l_{X4XAAn;SP7-Fw}(|GHyvNDL3BwUj^+p8X{gl@Zk447fG6!R5tDwR{5ZvNf0 zcL^s|W9Di?+Cg$rzq#p@S@&eZ$MblDWv?xG=3HYjKMRpKB?;6sH=NxvTR(P2ATeCI z?b}NWK=L1`Tb;+xm#Wz8mp1s-fe#%(yA6PGLAB1koTH12N!xgw0`q^*Me&|Z7!pD8 zub*ab8z63`0`lxvQkS{C8!GENQ$LARd}r%DGCPDQEF%;t@sa#a4X)+r?FPqC9={L@ zrq`OZ?IeDd-n#@|T9F|g>Lbk6A+N1T-bgK95>p#%#6OnYXdRmu2g?;;44bOdHf?^y^(u1-JUBBm!|a?u>( z6j2Wrcr#K;S%<#_2>z0mFVAj+bHZz%>gR_D!Q4Ld3>^iwf+LRZBE@NTNk{e~Ey>6M>;OD}&}&z0`l-eUvKF?~zf&fvwUEeu+0Nfb=Dg@y0kbqp9wg#qOz;gu zPuQ|-zPxzi6r(R|Xb?j&-+v&_9}sPSKP4kQe?(55$VZh9|0F%pJzn@bYFB5?lP&j` z^avWpkI6oa$RM?I%h@qaX*B!|@ivA1;qh)t85N3G5Ej&U`2ggy&CJBrh~KEtCk=h( zjsawYczUf!G+o5ciJ17gI$qSH;7tdT_ zP2R02^Ilt^{>mL-5;KxH`ktSSh?*3hh`Bxv63|Aq6$Z+DsJEW5W$Pfhi=Efi>*~4F z%oQ#j9KCaOFD5flE9WBXjuqrHLJ=vnmb4YSXre0dob8>cu^^;Q9 z)g=a$rN6e}*Ku6&VKXJ*mzF6|O`cmZTOcr%A)t{+lr7tcX*)2&N?xewfn$9R2mJY= zmh`>nT?E(Jght)Hy~_W$Y6GVehe%<_oWf%=ra)RqB3Clc9r z)D+2JIr0p=tq-)K6maX4D##VsDJade*cVE#(`hl&Etu1JIMXR#rB6w4 zq&>GV+sYXk5YIh|t8Q25${H^#o|OhduU%>dV7(nBmK#fDACHv?ZmGYz;6#(irP;NM z3A^V_xb_ZVS%WwN%%xGG()t+Ov8@MGrOxxIF_}z%r#dxb>2k(0VJ0m9phoht4HU*~ z0b^3)YOQ|67!>U@VVz5#dX^&1DvzcnWfWX+oF_jY%_WIuDT|h_%xnBq&k4i0r(Usr zL(zO~3KOiB6DIp^*x}h`$Aa@?s!W z5f~rykYEI1fw&!T;-mq(3e?Y?hDLfMH(ot&wzyB}(@TB|t8QO#d4CC{7P=LKOz z4OiC2O zKZ8-me@w3J`cf0o!gQV<*W4{)(ZZnnPm)jy(0LN|3%5NtvTC24`1HL5!A+OjE+TLB zu>aSn|HOr-;PV>N{1F4TSxKYvQ@?#-`NeP$1wFTYN?YvBiGVPLssi7w3A(*xl$5gm&(s(k@Us04W6vpmAk#^T z4*RSpAhvHm(e;YGOkm+$_qug^XdS3BWj@cHKtBn0qCet$R91u$`l?tT zNYUft#Q?zvm>7Yr20UeHl(PzecP)rz7BK_?-@uDo8eH_;OAdjINeK$HP!A@W%jsx3 z4_VlyCPugF5L|o(2FKknnyh!T<@(-doTK?)NNz_B-Ou&=Nk|o5Mvig!L~mvibgfq8rj#3_LQNwE%pe@*NrMuSO8Gc|U)%Gr|9yyl0fQJ2s7FEvazM2fMb8ADRp3jvghp6DEzYEh1SZp#{Ly21 zOoFMGx`eYcJG4qW_Pl*80xd7N!<;%02+$hYmQJLPan{p;20eic7|8K|51q?^eNILH+Pfa*AoJZmycdn9E{(Ij zMc3j=WPLuKy*d91km*q4tdKo5C?&0!4xTOSj@?9U44LIbbhJ{LNn9+-gSb}8R=^Ck z=<4UZF6kEAd;2RVD|2hyeVJZk+*6}!{DHCL3cPsP;&^pc>ViDva!`WrqV`gWk7`QvGrz?DaT((}el#r>-8L{+D8G;aQXvRCV zgdt|5rdsQCn3un2=tdXIM^dEjHbYFuy%M%SyEOuLd??)u8KZsVyR4RCSY=&?m2!;c zW$I=@vt}~}g9y@!C(TR<1`zCQz)Zurs~XuSDPt)V$wj@FnS1k`^QQ;@$8RfT1cj^o za%HEi+tuH4;AtR&7`j@FU*-rkjvb4!v7Z!r4a_CJb3QGmY@;oVN`9->BJgmv;Vy7r zN(N|CT(Lik6DJg=2pok)+(j{ep zq;!dhA{~N+gbIR$EvSToA|g^sN|%TzigZZ{NFyzI$MXAq=brD}bDrni^Ur-Qe;BaY zd#!i9@0??fIp&-sNPLKK?3SSAcsz~*u*!32IGjcD5VQ?nubaoe=ZL$nqAYuN7Z z?q6$V@?bxlh^X%KilH{Mlv&jFw6_S$Mo#s*@@b7AGo$|KaKBEs+k$=w+ppY*z);f($8obgokw!LTWD!zTb>ubmI zH9Fa+Sy_iC7rydQ6eFBbEPyG3HYSLxbE2QAEw zZ*MR8 z7}^-XN#oofS5@7d+1%W0De{OId?c3B^}7)K8}v$t*l3iMSN6Lb@JIM-G<2Z3LDI4q z%dM*NUwu_%4`sYXG5G3=B{Y;btNOTPWMVKGPJXHS9&d&LKX4NSc}&)FmluSXLjxNc zPEAcuuN&*^?8Hb6l91o(vV1<2P_JH5`7bYDo>^RtszqHA*@urGcgf1iel^q84J_~B zZqwr&jlgdaa2vTJpXlY){Afdw4cVEs3=u6 zwKd}aD6Bdh#1PjVNgkdl+~Mr(eD?hLjcvw8Mws+;G_hVwQ z;V$7h%F4>*^Fgs@H?7%Q;*}>7M1IUZL|u}W)=Dg`Pufux6^a`-R=UdO3U@%-fA-v5 zAkXUi74T{RADB7`ZPyl_>Jv}ApB`;y`^~`!Hn%%oX`CP88a|8JJqPk+$1q*4wcYZ7%kn+IN?w#k!+&tyqW||I*z10Mk;1iNhrA zGgZxN0zLhe-_CoA1UE`VYBZUc85^q-UF`EtHqZAvOk)db$-iM!= z8*S~yWV*7E*$~)*y6@iwPVIX@syO&Xnj05$OgS^e_zud1@MA{)%A zh}Yz~4u)_@x#M*&U%zIAsltp|Lvnh*-!{2%f|Q38S3gvdKT)(_Qc{w< zdEwJ}S?*fdB|D<)F%?$;Y(v1yv@{lU#9;DYs1)?&vazutUnyQ&nFz&{4ZYWfZ&h%! 
diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp
index a2df5bac569..3cc674f181f 100644
--- a/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp
+++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_inst.cpp
@@ -5,6 +5,9 @@
 #include "sparge_blockmap_trek.hpp"
 #include "ck_tile/ops/fmha/block/variants.hpp"
 
+#include 
+#include 
+#include 
 #include 
 
 // ============================================================================
@@ -61,6 +64,9 @@ using bmap_fp16_problem = ck_tile::BlockFmhaPipelineProblem;
 using bmap_fp16_kernel = ck_tile::SpargeBlockMapKernel;
 
+using kstats_fp16_pipeline = ck_tile::SpargeKStatsPipeline;
+using kstats_fp16_kernel = ck_tile::SpargeKStatsKernel;
+
 // ============================================================================
 // bf16: D=128, kM0=64,
kN0=128 // ============================================================================ @@ -112,6 +118,78 @@ using bmap_bf16_problem = ck_tile::BlockFmhaPipelineProblem; using bmap_bf16_kernel = ck_tile::SpargeBlockMapKernel; +using kstats_bf16_pipeline = ck_tile::SpargeKStatsPipeline; +using kstats_bf16_kernel = ck_tile::SpargeKStatsKernel; + +// ============================================================================ +// Internal K-stat workspace (R20): process-lifetime lazy hipMalloc, sized +// to the largest (batch, nhead_k, N_k, D) seen so far. Caller API unchanged. +// ============================================================================ + +namespace { + +struct KStatsWorkspace +{ + void* pooled_k_dev = nullptr; // [batch, nhead_k, N_k, D] fp32 + void* sim_k_dev = nullptr; // [batch, nhead_k, N_k] uint8 + size_t pooled_k_bytes = 0; + size_t sim_k_bytes = 0; + + void ensure(int batch, int nhead_k, int N_k, int D) + { + const size_t need_p = static_cast(batch) * nhead_k * N_k * D * sizeof(float); + const size_t need_s = static_cast(batch) * nhead_k * N_k * sizeof(uint8_t); + if(need_p > pooled_k_bytes) + { + if(pooled_k_dev != nullptr) (void)hipFree(pooled_k_dev); + (void)hipMalloc(&pooled_k_dev, need_p); + pooled_k_bytes = need_p; + } + if(need_s > sim_k_bytes) + { + if(sim_k_dev != nullptr) (void)hipFree(sim_k_dev); + (void)hipMalloc(&sim_k_dev, need_s); + sim_k_bytes = need_s; + } + } +}; + +KStatsWorkspace& g_kstats_ws() +{ + static KStatsWorkspace ws; + return ws; +} + +template +void launch_kstats_then_blockmap(sparge_blockmap_args args, const ck_tile::stream_config& s) +{ + const int N_k = ck_tile::integer_divide_ceil(args.seqlen_k, BlockMapKernel::kN0); + const int D = BlockMapKernel::D; + auto& ws = g_kstats_ws(); + ws.ensure(args.batch, args.nhead_k, N_k, D); + + // Stage 1: K stats + { + auto [kargs, grids] = + sparge_kstats_create_kargs_and_grids(args, ws.pooled_k_dev, ws.sim_k_dev); + const dim3 blocks = KStatsKernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = KStatsKernel::kBlockPerCu; + ck_tile::make_kernel(KStatsKernel{}, grids, blocks, 0, kargs)( + ck_tile::stream_config{s.stream_id_}); + } + // Stage 2: block_map (reads ws) + { + auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids( + args, ws.pooled_k_dev, ws.sim_k_dev); + const dim3 blocks = BlockMapKernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = BlockMapKernel::kBlockPerCu; + ck_tile::make_kernel(BlockMapKernel{}, grids, blocks, 0, kargs)( + ck_tile::stream_config{s.stream_id_}); + } +} + +} // namespace + // ============================================================================ // Dispatch // ============================================================================ @@ -122,26 +200,20 @@ float sparge_blockmap_fwd(sparge_blockmap_traits traits, { if(traits.data_type == "fp16" && traits.hdim_q == 128) { - using k_ = bmap_fp16_kernel; if(s.log_level_ > 0) std::cout << ", sparge_blockmap_fp16_d128" << std::flush; - auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); - const dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)); + return ck_tile::launch_kernel(s, [=](const ck_tile::stream_config& s_) { + launch_kstats_then_blockmap(args, s_); + }); } if(traits.data_type == "bf16" && traits.hdim_q == 128) { - using k_ = bmap_bf16_kernel; if(s.log_level_ > 0) std::cout << ", sparge_blockmap_bf16_d128" << std::flush; - auto 
[kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); - const dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)); + return ck_tile::launch_kernel(s, [=](const ck_tile::stream_config& s_) { + launch_kstats_then_blockmap(args, s_); + }); } if(s.log_level_ > 0) @@ -160,23 +232,13 @@ void sparge_blockmap_fwd_oneshot(sparge_blockmap_traits traits, { if(traits.data_type == "fp16" && traits.hdim_q == 128) { - using k_ = bmap_fp16_kernel; - auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); - const dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); + launch_kstats_then_blockmap(args, s); return; } if(traits.data_type == "bf16" && traits.hdim_q == 128) { - using k_ = bmap_bf16_kernel; - auto [kargs, grids] = sparge_blockmap_create_kargs_and_grids(args); - const dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); + launch_kstats_then_blockmap(args, s); return; } diff --git a/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp index 6eaeb9ea77b..92c32d29e85 100644 --- a/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp +++ b/example/ck_tile/50_sparse_attn/sparge_blockmap_trek.hpp @@ -8,7 +8,9 @@ #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp" #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp" #include "ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp" +#include "ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp" #include "ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp" +#include "ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp" #include "fmha_fwd_trek.hpp" @@ -45,6 +47,15 @@ struct sparge_blockmap_args void* block_map_ptr; void* lut_ptr; void* valid_block_num_ptr; + + // R21A Phase 4 + R21B fix: optional per-head superparams. nullptr => use scalar. + // Buffer sizes match SpargeAttn upstream contract (utils.py:324-328: all sized + // by Headnum=q.size(1)=nhead_q). K-side kernel still indexes [hk] into the + // first nhead_k entries — for MHA equivalent to old [nhead_k] sizing, for + // MQA/GQA aligns to upstream tuned ckpt layout. 
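+    // Example: with nhead_q = 8 and nhead_k = 2 (4:1 GQA), each buffer below
+    // holds 8 floats. The K-stats kernel reads simthreshd1_per_head[hk], so only
+    // entries [0..1] are consulted there; the block-map kernel reads the topk /
+    // cdfthreshd buffers per [hq], i.e. entries [0..7].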
+ const float* simthreshd1_per_head_ptr = nullptr; // size = nhead_q floats (kernel reads [0..nhead_k-1]) + const float* cdfthreshd_per_head_ptr = nullptr; // size = nhead_q floats + const float* topk_per_head_ptr = nullptr; // size = nhead_q floats }; struct sparge_blockmap_traits @@ -57,7 +68,9 @@ struct sparge_blockmap_traits // Create kernel args and grid dimensions // ============================================================================ template -auto sparge_blockmap_create_kargs_and_grids(sparge_blockmap_args args) +auto sparge_blockmap_create_kargs_and_grids(sparge_blockmap_args args, + const void* pooled_k_ws_ptr, + const void* sim_k_ws_ptr) { assert(args.nhead_q % args.nhead_k == 0); auto kargs = BlockMapKernel::MakeKargs(args.q_ptr, @@ -79,12 +92,38 @@ auto sparge_blockmap_create_kargs_and_grids(sparge_blockmap_args args) args.scale, args.block_map_ptr, args.lut_ptr, - args.valid_block_num_ptr); + args.valid_block_num_ptr, + pooled_k_ws_ptr, + sim_k_ws_ptr, + args.topk_per_head_ptr, + args.cdfthreshd_per_head_ptr); dim3 grids = BlockMapKernel::GridSize(args.batch, args.nhead_q, args.seqlen_q); return ck_tile::make_tuple(kargs, grids); } +template +auto sparge_kstats_create_kargs_and_grids(sparge_blockmap_args args, + void* pooled_k_ws_ptr, + void* sim_k_ws_ptr) +{ + assert(args.nhead_q % args.nhead_k == 0); + auto kargs = KStatsKernel::MakeKargs(args.k_ptr, + args.seqlen_k, + args.hdim_q, + args.nhead_k, + args.stride_k, + args.nhead_stride_k, + args.batch_stride_k, + args.simthreshd1, + pooled_k_ws_ptr, + sim_k_ws_ptr, + args.simthreshd1_per_head_ptr); + + dim3 grids = KStatsKernel::GridSize(args.batch, args.nhead_k, args.seqlen_k); + return ck_tile::make_tuple(kargs, grids); +} + // ============================================================================ // Hand-written template instantiation dispatch // ============================================================================ diff --git a/example/ck_tile/50_sparse_attn/test_sparge.cpp b/example/ck_tile/50_sparse_attn/test_sparge.cpp index 81a49ca006b..4c97a10d0f0 100644 --- a/example/ck_tile/50_sparse_attn/test_sparge.cpp +++ b/example/ck_tile/50_sparse_attn/test_sparge.cpp @@ -105,7 +105,10 @@ auto create_args(int argc, char* argv[]) .insert("seed", "42", "random seed") .insert("warmup", "5", "warmup iterations") .insert("repeat", "20", "benchmark iterations") - .insert("kname", "0", "print kernel name"); + .insert("kname", "0", "print kernel name") + .insert("perhead", "0", + "R21A Phase 4: 0=scalar (default), 1=per-head [H] superparam test " + "(varies topk[h] = topk * (1 + 0.5*(h - H/2)/H), simthreshd1 unchanged)"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -135,6 +138,7 @@ bool run_test(const ck_tile::ArgParser& arg_parser) int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); int kname = arg_parser.get_int("kname"); + int perhead = arg_parser.get_int("perhead"); if(nhead_k < 0) nhead_k = nhead; if(seqlen_k < 0) seqlen_k = seqlen_q; @@ -231,6 +235,33 @@ bool run_test(const ck_tile::ArgParser& arg_parser) bmap_args.lut_ptr = (pipeline == "vsa") ? lut_dev.GetDeviceBuffer() : nullptr; bmap_args.valid_block_num_ptr = (pipeline == "vsa") ? valid_bn_dev.GetDeviceBuffer() : nullptr; + // R21A Phase 4 + R21B fix: per-head superparam buffers, all sized [nhead_q] + // to match SpargeAttn upstream contract (utils.py:324-328, Headnum=q.size(1)). + // K-side kernel reads only the first nhead_k entries via [hk]. 
+ ck_tile::DeviceMem topk_per_head_dev(static_cast(nhead) * sizeof(float)); + ck_tile::DeviceMem sim1_per_head_dev(static_cast(nhead) * sizeof(float)); + ck_tile::DeviceMem cdf_per_head_dev (static_cast(nhead) * sizeof(float)); + if(perhead != 0) + { + std::vector topk_h(nhead); + std::vector sim1_h(nhead); + std::vector cdf_h (nhead); + for(int h = 0; h < nhead; ++h) + { + // small per-head jitter around scalar topk so sparsity differs by head + const float jitter = 0.5f * (static_cast(h - nhead / 2) / nhead); + topk_h[h] = topk * (1.0f + jitter); + sim1_h[h] = simthreshd1; // bit-identical to scalar (kernel reads [0..nhead_k-1]) + cdf_h[h] = cdfthreshd; + } + topk_per_head_dev.ToDevice(topk_h.data()); + sim1_per_head_dev.ToDevice(sim1_h.data()); + cdf_per_head_dev .ToDevice(cdf_h.data()); + bmap_args.topk_per_head_ptr = static_cast(topk_per_head_dev.GetDeviceBuffer()); + bmap_args.simthreshd1_per_head_ptr = static_cast(sim1_per_head_dev.GetDeviceBuffer()); + bmap_args.cdfthreshd_per_head_ptr = static_cast(cdf_per_head_dev.GetDeviceBuffer()); + } + // ---- build attention args ---- ck_tile::stream_config stream_cfg; stream_cfg.stream_id_ = nullptr; diff --git a/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp b/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp index ca177abf23a..62b5b3591c0 100644 --- a/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp +++ b/include/ck_tile/ops/sparse_attn/kernel/sparge_blockmap_kernel.hpp @@ -52,7 +52,20 @@ struct SpargeBlockMapKernel void* lut_ptr; void* valid_block_num_ptr; + // R20 K-stat workspace from Kernel A + const void* pooled_k_ws_ptr; // [batch, nhead_k, N_k, D] fp32 + const void* sim_k_ws_ptr; // [batch, nhead_k, N_k] uint8 + index_t N_k; + + // R21A Phase 4: optional per-head topk (size = nhead_q floats). + // nullptr => use scalar `topk` for all heads. + const float* topk_per_head; + + // R21B: optional per-head cdfthreshd (size = nhead_q floats). + // nullptr => use scalar `cdfthreshd` for all heads. + // Only consulted on topk<=0 path; bench currently always uses topk path. + const float* cdfthreshd_per_head; }; CK_TILE_HOST static constexpr auto MakeKargs(const void* q_ptr, @@ -74,7 +87,11 @@ struct SpargeBlockMapKernel float scale, void* block_map_ptr, void* lut_ptr, - void* valid_block_num_ptr) + void* valid_block_num_ptr, + const void* pooled_k_ws_ptr, + const void* sim_k_ws_ptr, + const float* topk_per_head = nullptr, + const float* cdfthreshd_per_head = nullptr) { const index_t N_k = integer_divide_ceil(seqlen_k, kN0); return Kargs{q_ptr, @@ -97,7 +114,11 @@ struct SpargeBlockMapKernel block_map_ptr, lut_ptr, valid_block_num_ptr, - N_k}; + pooled_k_ws_ptr, + sim_k_ws_ptr, + N_k, + topk_per_head, + cdfthreshd_per_head}; } CK_TILE_HOST static constexpr auto GridSize(index_t batch, index_t nhead_q, index_t seqlen_q) @@ -174,6 +195,21 @@ struct SpargeBlockMapKernel // Shared memory __shared__ char smem[Pipeline::GetSmemSize()]; + // R20 K-stat workspace: pre-offset for this (b, hk). + const index_t nhead_k = kargs.nhead_q / kargs.nhead_ratio_qk; + const index_t khead_off = (b * nhead_k + hk) * N_k; + const auto* pooled_k_ws = + reinterpret_cast(kargs.pooled_k_ws_ptr) + khead_off * D; + const auto* sim_k_ws = + reinterpret_cast(kargs.sim_k_ws_ptr) + khead_off; + + // R21A Phase 4: per-head topk if provided, else scalar broadcast. + const float topk_eff = + (kargs.topk_per_head != nullptr) ? 
kargs.topk_per_head[hq] : kargs.topk; + // R21B: per-head cdfthreshd if provided, else scalar broadcast. + const float cdfthreshd_eff = + (kargs.cdfthreshd_per_head != nullptr) ? kargs.cdfthreshd_per_head[hq] : kargs.cdfthreshd; + Pipeline{}(q_window, k_window, kargs.seqlen_q, @@ -182,12 +218,14 @@ struct SpargeBlockMapKernel N_k, kargs.nhead_ratio_qk, kargs.simthreshd1, - kargs.cdfthreshd, - kargs.topk, + cdfthreshd_eff, + topk_eff, kargs.scale, bmap_ptr, lut_out, valid_out, + pooled_k_ws, + sim_k_ws, static_cast(smem)); } }; diff --git a/include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp b/include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp new file mode 100644 index 00000000000..3ce494f8702 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/kernel/sparge_kstats_kernel.hpp @@ -0,0 +1,136 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include + +namespace ck_tile { + +// Kernel A wrapper: grid (N_k, nhead_k, batch). Each work-group precomputes +// K-block stats (pooled_k_mean[D], sim_k) for one (b, hk, kb) into a workspace +// that Kernel B (block_map) reads instead of recomputing per Q-block. +template +struct SpargeKStatsKernel +{ + using Pipeline = remove_cvref_t; + + static constexpr index_t kBlockSize = Pipeline::kBlockSize; + static constexpr index_t kBlockPerCu = Pipeline::kBlockPerCu; + + using QDataType = typename Pipeline::QDataType; + using KDataType = typename Pipeline::KDataType; + + static constexpr index_t kN0 = Pipeline::kN0; + static constexpr index_t D = Pipeline::D; + + static constexpr index_t kAlignment = 16 / sizeof(KDataType); + + struct Kargs + { + const void* k_ptr; + + index_t seqlen_k; + index_t hdim_q; + index_t nhead_k; + + index_t stride_k; + index_t nhead_stride_k; + index_t batch_stride_k; + + float simthreshd1; + + void* pooled_k_ptr; // [batch, nhead_k, N_k, D] fp32 + void* sim_k_ptr; // [batch, nhead_k, N_k] uint8 + + index_t N_k; + + // R21A Phase 4 + R21B fix: optional per-head simthreshd1. + // Buffer is sized [nhead_q] floats to match SpargeAttn upstream contract + // (utils.py:324, Headnum=q.size(1)). Kernel only indexes the first + // nhead_k entries via [hk]. nullptr => use scalar `simthreshd1`. 
+ const float* simthreshd1_per_head; + }; + + CK_TILE_HOST static constexpr auto MakeKargs(const void* k_ptr, + index_t seqlen_k, + index_t hdim_q, + index_t nhead_k, + index_t stride_k, + index_t nhead_stride_k, + index_t batch_stride_k, + float simthreshd1, + void* pooled_k_ptr, + void* sim_k_ptr, + const float* simthreshd1_per_head = nullptr) + { + const index_t N_k = integer_divide_ceil(seqlen_k, kN0); + return Kargs{k_ptr, + seqlen_k, + hdim_q, + nhead_k, + stride_k, + nhead_stride_k, + batch_stride_k, + simthreshd1, + pooled_k_ptr, + sim_k_ptr, + N_k, + simthreshd1_per_head}; + } + + CK_TILE_HOST static constexpr auto GridSize(index_t batch, index_t nhead_k, index_t seqlen_k) + { + const index_t N_k = integer_divide_ceil(seqlen_k, kN0); + return dim3(N_k, nhead_k, batch); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + const index_t kb = static_cast(blockIdx.x); + const index_t hk = static_cast(blockIdx.y); + const index_t b = static_cast(blockIdx.z); + + const auto* k_base = reinterpret_cast(kargs.k_ptr) + + b * kargs.batch_stride_k + hk * kargs.nhead_stride_k + + kb * kN0 * kargs.stride_k; + + const auto k_dram_naive = make_naive_tensor_view( + k_base, + make_tuple(kargs.seqlen_k - kb * kN0, D), + make_tuple(kargs.stride_k, 1), + number{}, + number<1>{}); + const auto k_dram = pad_tensor_view( + k_dram_naive, make_tuple(number{}, number{}), sequence{}); + + auto k_window = make_tile_window(k_dram, + make_tuple(number{}, number{}), + {0, 0}, + Pipeline::MakeKBlockDistribution()); + + const index_t N_k = kargs.N_k; + const index_t khead_off = (b * kargs.nhead_k + hk) * N_k; + auto* pooled_k_out = reinterpret_cast(kargs.pooled_k_ptr) + (khead_off + kb) * D; + auto* sim_k_out = reinterpret_cast(kargs.sim_k_ptr) + (khead_off + kb); + + __shared__ char smem[Pipeline::GetSmemSize()]; + + // R21A Phase 4: per-head simthreshd1 if provided, else scalar broadcast. + const float simthreshd1_eff = (kargs.simthreshd1_per_head != nullptr) + ? kargs.simthreshd1_per_head[hk] + : kargs.simthreshd1; + + Pipeline{}(k_window, + kargs.seqlen_k, + kb, + simthreshd1_eff, + pooled_k_out, + sim_k_out, + static_cast(smem)); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp b/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp index 222e73c60e2..25e3b964e93 100644 --- a/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp +++ b/include/ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp @@ -32,14 +32,22 @@ struct SpargeBlockMapPipeline static constexpr index_t kMaxKBlocks = 1024; // LDS layout (non-overlapping, all used simultaneously in Phase 2): - // [0 .. kReduceBytes) cross-warp reduction scratch - // [kScoreOffset ..) scores[N_k] - // [kBmapOffset ..) block_map[N_k] - // [kSmallOffset ..) Phase 3 argmax scratch (2*NumWarps floats) - static constexpr index_t kReduceBytes = NumWarps * D * sizeof(float); - static constexpr index_t kScoreOffset = kReduceBytes; - static constexpr index_t kBmapOffset = kScoreOffset + kMaxKBlocks * sizeof(float); - static constexpr index_t kSmallOffset = kBmapOffset + kMaxKBlocks * sizeof(uint8_t); + // [0 .. kReduceBytes) cross-warp reduction scratch slab 0 + // [kReduceBytes .. 2*kReduceBytes) cross-warp reduction scratch slab 1 + // (Round 8 b1: ping-pong for K-loop double buffer) + // [kScoreOffset ..) scores[N_k] + // [kBmapOffset ..) 
block_map[N_k] + // [kSmallOffset ..) Phase 3 argmax scratch (2*NumWarps floats) + // B2.v3 column-stride pad: replace k_idx*KPerThread with k_idx*(KPerThread+1) + // to break the 4-way intra-warp bank conflict. New per-warp slab size: + // KThreads * (KPerThread + 1) floats. + static constexpr index_t kColPaddedStride = KPerThread + 1; + static constexpr index_t kPerWarpFloats = KThreads * kColPaddedStride; + static constexpr index_t kReduceBytes = NumWarps * kPerWarpFloats * sizeof(float); + static constexpr index_t kReduceTotalBytes = 2 * kReduceBytes; // Round 8 b1: 2 slabs + static constexpr index_t kScoreOffset = kReduceTotalBytes; + static constexpr index_t kBmapOffset = kScoreOffset + kMaxKBlocks * sizeof(float); + static constexpr index_t kSmallOffset = kBmapOffset + kMaxKBlocks * sizeof(uint8_t); CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { @@ -98,6 +106,12 @@ struct SpargeBlockMapPipeline } // Cross-warp LDS reduction for column sums. + // Round 13f: templated TrailingSync flag. When false, the trailing __syncthreads() + // is dropped — only safe when the next access targets a *different* slab and the + // intervening work does not read smem_reduce. Used at the slab_b call in Phase 2 + // K-loop, where the next iter's first cross-warp reduce writes to slab_a (different + // address) and is preceded by its own leading sync. + template CK_TILE_DEVICE static void column_reduce_cross_warp(float (&col_acc)[KPerThread], float* __restrict__ smem_reduce) { @@ -107,17 +121,21 @@ struct SpargeBlockMapPipeline const index_t k_idx = lane_id % KThreads; const index_t m_idx = lane_id / KThreads; + // B2.v3 column-stride pad: stride k_idx by (KPerThread+1)=9 instead of 8, + // changing per-lane bank from (k_idx*8+k)%32 to (k_idx*9+k)%32. For k=0, + // lanes (k_idx={0,4,8,12}) now hit banks {0,4,8,12} instead of all 0. if(m_idx == 0) for(index_t k = 0; k < KPerThread; ++k) - smem_reduce[warp_id * D + k_idx * KPerThread + k] = col_acc[k]; + smem_reduce[warp_id * kPerWarpFloats + k_idx * kColPaddedStride + k] = col_acc[k]; __syncthreads(); for(index_t k = 0; k < KPerThread; ++k) col_acc[k] = 0.f; for(index_t w = 0; w < NumWarps; ++w) for(index_t k = 0; k < KPerThread; ++k) - col_acc[k] += smem_reduce[w * D + k_idx * KPerThread + k]; - __syncthreads(); + col_acc[k] += smem_reduce[w * kPerWarpFloats + k_idx * kColPaddedStride + k]; + if constexpr(TrailingSync) + __syncthreads(); } // Compute ||v||^2 per row: sum along KPerThread then xor-shuffle across k_idx. @@ -162,7 +180,8 @@ struct SpargeBlockMapPipeline for(index_t m = 0; m < SeqPerThread; ++m) { - float inv_norm = (row_norms[m] > 0.f) ? (1.0f / __builtin_sqrtf(row_norms[m])) : 0.f; + // Round 12: hardware fast rsqrt (v_rsq_f32, ~1 ULP) replaces sw sqrt+rcp. + float inv_norm = (row_norms[m] > 0.f) ? 
rsqrtf(row_norms[m]) : 0.f; index_t gsq = m * (SeqThreadPerWarp * NumWarps) + warp_id * SeqThreadPerWarp + m_idx; if(gsq < actual_seq) for(index_t k = 0; k < KPerThread; ++k) @@ -230,9 +249,9 @@ struct SpargeBlockMapPipeline // ====================================================================== template CK_TILE_DEVICE void operator()(const QWindowType& q_window_in, - const KWindowType& k_window_in, + const KWindowType& /*k_window_in*/, index_t seqlen_q, - index_t seqlen_k, + index_t /*seqlen_k*/, index_t qb, index_t N_k, index_t /*nhead_ratio_qk*/, @@ -243,11 +262,15 @@ struct SpargeBlockMapPipeline uint8_t* block_map_ptr, int32_t* lut_ptr, int32_t* valid_block_num_ptr, + const float* __restrict__ pooled_k_ws_ptr, + const uint8_t* __restrict__ sim_k_ws_ptr, void* smem_ptr) const { const index_t tid = static_cast(threadIdx.x); - auto* smem_float = reinterpret_cast(smem_ptr); + // R20: K-loop no longer reduces, only Phase 1 uses smem_float0. + // smem_float1 slab is allocated for layout compat but unused. + auto* smem_float0 = reinterpret_cast(smem_ptr); auto* smem_scores = reinterpret_cast(reinterpret_cast(smem_ptr) + kScoreOffset); auto* smem_bmap = @@ -271,16 +294,22 @@ struct SpargeBlockMapPipeline row_reduce_sq_norm(q_data, psq, bs_q); // 1b. Column sum -> mean + // Track F (re-apply R8 b2): drop trailing sync. Next reduce reuses same slab + // (smem_float0) and has its own leading __syncthreads() before reading. + // pooled_q_mean is register-only between reduces. float pooled_q_mean[KPerThread]; column_reduce_thread_and_warp(q_data, pooled_q_mean); - column_reduce_cross_warp(pooled_q_mean, smem_float); + column_reduce_cross_warp(pooled_q_mean, smem_float0); for(index_t k = 0; k < KPerThread; ++k) pooled_q_mean[k] *= inv_bs_q; // 1c. Normalised sum_hat + // Track F (re-apply R8 b2): drop trailing sync. Next cross-warp reduce in + // K-loop iter 0 writes slab_a=smem_float0 (kb=0 even). Although same slab, + // its leading __syncthreads() covers the WAR. sum_hat register-only here. float sum_hat[KPerThread]; column_reduce_normalised(q_data, psq, sum_hat, bs_q); - column_reduce_cross_warp(sum_hat, smem_float); + column_reduce_cross_warp(sum_hat, smem_float0); // 1d. sim_q = ||sum_hat||^2 / bs_q^2 float sh_sq = 0.f; @@ -319,49 +348,34 @@ struct SpargeBlockMapPipeline smem_bmap[i] = 0; __syncthreads(); - auto k_window = k_window_in; + // R20: K-stats precomputed by Kernel A. Each thread loads its own + // KPerThread-slice of pooled_k_mean from DRAM workspace; sim_k is a single + // byte. No K-tile load, no cross-warp reduce in the K-loop. + const index_t lane_id_kb = tid % WarpSize; + const index_t k_idx_kb = lane_id_kb % KThreads; for(index_t kb = 0; kb < N_k; ++kb) { - const index_t bs_k = min(static_cast(kN0), seqlen_k - kb * kN0); - const float inv_bs_k = (bs_k > 0) ? 
(1.0f / static_cast(bs_k)) : 0.f; - - auto k_tile = load_tile(k_window); - - float k_data[NPerThread * KPerThread]; - tile_to_float(k_tile, k_data); - - // K mean + const float* p_kb = pooled_k_ws_ptr + kb * D + k_idx_kb * KPerThread; float pooled_k_mean[KPerThread]; - column_reduce_thread_and_warp(k_data, pooled_k_mean); - column_reduce_cross_warp(pooled_k_mean, smem_float); for(index_t k = 0; k < KPerThread; ++k) - pooled_k_mean[k] *= inv_bs_k; + pooled_k_mean[k] = p_kb[k]; - // dot(pooled_q_mean, pooled_k_mean) float dot = 0.f; for(index_t k = 0; k < KPerThread; ++k) dot += pooled_q_mean[k] * pooled_k_mean[k]; dot = reduce_across_k(dot); - // K L2 norms + normalised sum_hat - float k_psq[NPerThread]; - row_reduce_sq_norm(k_data, k_psq, bs_k); - - float k_sum_hat[KPerThread]; - column_reduce_normalised(k_data, k_psq, k_sum_hat, bs_k); - column_reduce_cross_warp(k_sum_hat, smem_float); - - // sim_k - float ksh_sq = 0.f; - for(index_t k = 0; k < KPerThread; ++k) - ksh_sq += k_sum_hat[k] * k_sum_hat[k]; - ksh_sq = reduce_across_k(ksh_sq); - const float denom_k = static_cast(bs_k) * static_cast(bs_k); - const bool sim_k = (denom_k > 0.f) && ((ksh_sq / denom_k) > simthreshd1); + const bool sim_k = (sim_k_ws_ptr[kb] != 0); if(tid == 0) { + // INVARIANT (mirrors SpargeAttn ref utils.py:175-180): + // ~sim_k blocks are forced ON in the bitmap (final_map[~sim_k]=1) + // AND have score = -inf so Phase 3 selection (topk / cdf) does NOT + // pick them again (would double-count toward topk budget). + // Both writes MUST stay together. Any Phase 3 selection rewrite + // (e.g. iterative argmax → bitonic sort) must keep the -inf write. if(!sim_k) { smem_bmap[kb] = 1; @@ -372,10 +386,8 @@ struct SpargeBlockMapPipeline smem_scores[kb] = dot * scale; } } - __syncthreads(); - - move_tile_window(k_window, {kN0, 0}); } + __syncthreads(); // guard Phase 3's reads of smem_bmap / smem_scores // ================================================================== // Phase 3: Softmax + Selection @@ -399,15 +411,24 @@ struct SpargeBlockMapPipeline } const float sum_exp = block_reduce_sum(lsum, smem_small); - // normalise - const float inv_sum = (sum_exp > 0.f) ? (1.0f / sum_exp) : 0.f; - for(index_t i = tid; i < N_k; i += kBlockSize) - smem_scores[i] *= inv_sum; - __syncthreads(); + // Round 13i: argmax is invariant under positive scaling (inv_sum > 0). When + // topk > 0 we never read normalised values for cdfthreshd, so skip the + // normalise pass entirely (saves N_k LDS writes + 1 __syncthreads). The + // cdfthreshd path (topk <= 0) still requires normalised scores so the + // accumulator `cumulative_prob` matches probabilities. + const bool topk_active = (topk > 0.f); + const float inv_sum = + (!topk_active && sum_exp > 0.f) ? (1.0f / sum_exp) : 0.f; + if(!topk_active) + { + for(index_t i = tid; i < N_k; i += kBlockSize) + smem_scores[i] *= inv_sum; + __syncthreads(); + } // Selection: iterative argmax index_t num_to_select = - (topk > 0.f) + topk_active ? max(static_cast(1), static_cast(topk * static_cast(N_k))) : N_k; @@ -448,6 +469,11 @@ struct SpargeBlockMapPipeline } __syncthreads(); + // Round 13g: collapse 2 syncs/round into 1. tid==0 computes the global + // winner AND writes the sentinel (smem_bmap=1, smem_scores=-1) in the same + // critical section, gated by bv>0. All threads then read smem_small[0] for + // the early break / cumulative_prob accumulation. Saves 1 __syncthreads per + // round (~32 syncs @ N_k=64 topk=0.5). 
if(tid == 0) { float bv = smem_small[0]; @@ -462,24 +488,22 @@ struct SpargeBlockMapPipeline bi = wi; } } + // Write sentinel into bmap/scores in the same critical section. + // Guarded by bv > 0 so we never poison a valid score with -1. + if(bv > 0.f) + { + smem_bmap[bi] = 1; + smem_scores[bi] = -1.f; + } smem_small[0] = bv; - smem_small[1] = bit_cast(static_cast(bi)); } __syncthreads(); - float g_val = smem_small[0]; - index_t g_idx = bit_cast(smem_small[1]); + float g_val = smem_small[0]; if(g_val <= 0.f) break; - if(tid == 0) - { - smem_bmap[g_idx] = 1; - smem_scores[g_idx] = -1.f; - } - __syncthreads(); - if(topk > 0.f) { if(round + 1 >= num_to_select) diff --git a/include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp b/include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp new file mode 100644 index 00000000000..1cb96d716a3 --- /dev/null +++ b/include/ck_tile/ops/sparse_attn/pipeline/sparge_kstats_pipeline.hpp @@ -0,0 +1,110 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/sparse_attn/pipeline/sparge_blockmap_pipeline.hpp" + +namespace ck_tile { + +// Kernel A of the K-stat precompute split: one work-group per (b, hk, kb) +// computes pooled_k_mean and sim_k for that K-block once. Kernel B then reads +// from the workspace instead of recomputing per Q-block. +template +struct SpargeKStatsPipeline +{ + using Problem = remove_cvref_t; + using Base = SpargeBlockMapPipeline; + using QDataType = typename Base::QDataType; + using KDataType = typename Base::KDataType; + + static constexpr index_t kBlockSize = Base::kBlockSize; + static constexpr index_t kM0 = Base::kM0; + static constexpr index_t kN0 = Base::kN0; + static constexpr index_t D = Base::D; + static constexpr index_t NumWarps = Base::NumWarps; + static constexpr index_t WarpSize = Base::WarpSize; + + static constexpr index_t KPerThread = Base::KPerThread; + static constexpr index_t KThreads = Base::KThreads; + static constexpr index_t SeqThreadPerWarp = Base::SeqThreadPerWarp; + static constexpr index_t NPerThread = Base::NPerThread; + + static constexpr index_t kBlockPerCu = 1; + + static constexpr index_t kColPaddedStride = Base::kColPaddedStride; + static constexpr index_t kPerWarpFloats = Base::kPerWarpFloats; + static constexpr index_t kReduceBytes = NumWarps * kPerWarpFloats * sizeof(float); + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return kReduceBytes; } + + CK_TILE_HOST_DEVICE static constexpr auto MakeKBlockDistribution() + { + return Base::MakeKBlockDistribution(); + } + + // operator(): one work-group, one K-block. Writes D fp32 + 1 uint8 to workspace. + template + CK_TILE_DEVICE void operator()(const KWindowType& k_window, + index_t seqlen_k, + index_t kb, + float simthreshd1, + float* __restrict__ pooled_k_out, // D floats + uint8_t* __restrict__ sim_k_out, // 1 byte + void* smem_ptr) const + { + const index_t tid = static_cast(threadIdx.x); + auto* smem_reduce = reinterpret_cast(smem_ptr); + + const index_t bs_k = min(static_cast(kN0), seqlen_k - kb * kN0); + const float inv_bs_k = (bs_k > 0) ? 
(1.0f / static_cast(bs_k)) : 0.f; + + auto k_tile = load_tile(k_window); + + float k_data[NPerThread * KPerThread]; + Base::template tile_to_float(k_tile, k_data); + + const index_t warp_id = tid / WarpSize; + const index_t lane_id = tid % WarpSize; + const index_t k_idx = lane_id % KThreads; + const index_t m_idx = lane_id / KThreads; + + // pooled_k_mean: column sum then cross-warp reduce. + // R21A: drop trailing sync (next cross_warp_reduce has its own leading sync). + float pooled_k_mean[KPerThread]; + Base::template column_reduce_thread_and_warp(k_data, pooled_k_mean); + Base::template column_reduce_cross_warp(pooled_k_mean, smem_reduce); + for(index_t k = 0; k < KPerThread; ++k) + pooled_k_mean[k] *= inv_bs_k; + + // R21A: write pooled_k_mean to global early so its register liveness ends here, + // freeing VGPR before k_sum_hat becomes live. + if(warp_id == 0 && m_idx == 0) + { + for(index_t k = 0; k < KPerThread; ++k) + pooled_k_out[k_idx * KPerThread + k] = pooled_k_mean[k]; + } + + // K row L2 norms + normalised column sum (k_sum_hat) + float k_psq[NPerThread]; + Base::template row_reduce_sq_norm(k_data, k_psq, bs_k); + + float k_sum_hat[KPerThread]; + Base::template column_reduce_normalised(k_data, k_psq, k_sum_hat, bs_k); + // R21A: drop trailing sync (no further smem read; only intra-warp shuffle + global write). + Base::template column_reduce_cross_warp(k_sum_hat, smem_reduce); + + // sim_k = (||k_sum_hat||^2 / bs_k^2) > simthreshd1 + float ksh_sq = 0.f; + for(index_t k = 0; k < KPerThread; ++k) + ksh_sq += k_sum_hat[k] * k_sum_hat[k]; + ksh_sq = Base::reduce_across_k(ksh_sq); + const float denom_k = static_cast(bs_k) * static_cast(bs_k); + const bool sim_k = (denom_k > 0.f) && ((ksh_sq / denom_k) > simthreshd1); + + if(tid == 0) + *sim_k_out = sim_k ? static_cast(1) : static_cast(0); + } +}; + +} // namespace ck_tile
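For readers cross-checking the R20 two-kernel split, the sketch below is a minimal host-side reference for what SpargeKStatsKernel writes per (b, hk, kb): the pooled mean of the K block's rows and the sim flag, i.e. the same ksh_sq / (bs_k * bs_k) > simthreshd1 test the pipeline performs on-device. The names KBlockStats and kstats_reference are illustrative only and not part of the patch; the workspace slot for a block is pooled_k[((b * nhead_k + hk) * N_k + kb) * D .. + D) and sim_k[(b * nhead_k + hk) * N_k + kb], matching the khead_off arithmetic in both kernels.

#include <cmath>
#include <cstdint>
#include <vector>

struct KBlockStats
{
    std::vector<float> pooled_mean; // D floats: mean of the block's rows
    uint8_t sim;                    // 1 if the block's mean cosine similarity > simthreshd1
};

// Reference for one K block: rows [s0, s1) of a [seqlen_k, D] head slice, row-major.
// mean_sim = ||sum_i k_hat_i||^2 / bs^2, where k_hat_i is row i normalised along D.
inline KBlockStats kstats_reference(
    const std::vector<float>& k_head, int D, int s0, int s1, float simthreshd1)
{
    KBlockStats out{std::vector<float>(D, 0.0f), 0};
    const int bs = s1 - s0;
    if(bs <= 0)
        return out;

    std::vector<float> sum_hat(D, 0.0f);
    for(int s = s0; s < s1; ++s)
    {
        const float* row = k_head.data() + static_cast<size_t>(s) * D;
        float norm2 = 0.0f;
        for(int d = 0; d < D; ++d)
        {
            out.pooled_mean[d] += row[d];
            norm2 += row[d] * row[d];
        }
        const float inv_norm = (norm2 > 0.0f) ? 1.0f / std::sqrt(norm2) : 0.0f;
        for(int d = 0; d < D; ++d)
            sum_hat[d] += row[d] * inv_norm;
    }

    const float inv_bs = 1.0f / static_cast<float>(bs);
    for(int d = 0; d < D; ++d)
        out.pooled_mean[d] *= inv_bs;

    float sum_gram = 0.0f; // equals the sum of all pairwise dot(k_hat_i, k_hat_j)
    for(int d = 0; d < D; ++d)
        sum_gram += sum_hat[d] * sum_hat[d];

    const float mean_sim = sum_gram / (static_cast<float>(bs) * static_cast<float>(bs));
    out.sim = (mean_sim > simthreshd1) ? uint8_t{1} : uint8_t{0};
    return out;
}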
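The B2.v3 column-stride pad in sparge_blockmap_pipeline.hpp can also be sanity-checked in isolation. The standalone program below is an illustrative sketch, not part of the patch: it assumes 32 LDS banks of 4 bytes (which is what the pipeline comment's mod-32 arithmetic implies) and the D = 128 configuration with KThreads = 16 and KPerThread = 8, and it reproduces the 4-way conflict for the unpadded stride of 8 versus the conflict-free mapping for the padded stride of 9.

#include <array>
#include <cstdio>

int main()
{
    constexpr int kNumBanks   = 32; // 32 banks, 4 bytes each; a float lands in bank (index % 32)
    constexpr int kKThreads   = 16; // lanes holding distinct k_idx within a warp row
    constexpr int kKPerThread = 8;  // floats per lane (D = 128 / 16)

    for(int stride : {kKPerThread, kKPerThread + 1}) // 8 = unpadded, 9 = B2.v3 padded
    {
        std::array<int, kNumBanks> hits{};
        for(int k_idx = 0; k_idx < kKThreads; ++k_idx)
            ++hits[(k_idx * stride) % kNumBanks]; // bank touched by element k = 0 of this lane

        int worst = 0;
        for(int h : hits)
            worst = (h > worst) ? h : worst;
        std::printf("stride %d: worst-case %d lanes per bank\n", stride, worst);
    }
    return 0; // prints 4 for stride 8, 1 for stride 9
}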