Skip to content

Commit 45a5e01

Browse files
committed
Tested on an NVIDIA GeForce RTX 4090 GPU with CUDA 12.x.
1 parent b2e34d0 commit 45a5e01

File tree

7 files changed

+544
-497
lines changed

7 files changed

+544
-497
lines changed

mlir/cuda-tile/.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
11
*.ptx
22
*.cubin
33
*.fatbin
4+
*.bc
5+
*.ll
6+
*.o
7+
*.s
8+
*.so
9+
*.dylib
10+
*.a
11+
*.dll
12+
*.obj
13+
*.exe
14+
*.log
15+
*.cache
16+
*.tmp
17+
*.bin
18+
*.out

mlir/cuda-tile/README.md

Lines changed: 270 additions & 464 deletions
Large diffs are not rendered by default.

mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ cuda_shim_load_module_from_file(uint64_t file_path_ptr,
330330
uint64_t /*file_path_nbytes*/) {
331331
auto file_path_cstr =
332332
reinterpret_cast<const char *>(asHostCPtr(file_path_ptr));
333-
// fprintf(stdout, "%s", file_path_cstr);
333+
debug_print("Loading CUDA module from file: %s\n", file_path_cstr);
334334
CUmodule module = nullptr;
335335
ScopedContext scopedContext;
336336
CUDA_REPORT_IF_ERROR(cuModuleLoad(&module, file_path_cstr));
@@ -519,7 +519,7 @@ extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); }
519519

520520
// only for debugging
521521
extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) {
522-
auto *p = reinterpret_cast<const float*>(static_cast<uintptr_t>(dptr));
522+
auto *p = reinterpret_cast<const float *>(static_cast<uintptr_t>(dptr));
523523
for (uint32_t i = 0; i < n; ++i) {
524524
fprintf(stderr, "i=%u v=%f\n", i, p[i]);
525525
}

mlir/cuda-tile/Toy/include/cuda_shim/CudaShimBuilder.hpp

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "mlir/Dialect/Arith/IR/Arith.h"
1010
#include "mlir/Dialect/Func/IR/FuncOps.h"
1111
#include "mlir/Dialect/MemRef/IR/MemRef.h"
12+
#include "mlir/ExecutionEngine/ExecutionEngine.h"
1213
#include "mlir/IR/Builders.h"
1314
#include "mlir/IR/BuiltinAttributes.h"
1415
#include "mlir/IR/BuiltinOps.h"
@@ -296,3 +297,134 @@ inline unsigned long getNbytes(mlir::Type tensorType) {
296297
ranked_tensor_type.getElementTypeBitWidth(),
297298
8);
298299
}
// ---------------------------------------------------------------------------
// C ABI of the CUDA shim runtime (implemented in cuda_shim.cpp).
// Device pointers and opaque handles (modules, streams, events) cross the
// boundary as uint64_t so JIT-compiled code needs no CUDA headers.
// ---------------------------------------------------------------------------
extern "C" {
// Load a module from a PTX or CUBIN image in memory. The driver API's
// cuModuleLoadDataEx accepts both formats and auto-detects which one it got.
uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr,
                                          uint64_t image_nbytes);
// As above, but JIT-compiles PTX at the requested optimization level.
uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr,
                                              uint64_t image_nbytes,
                                              int opt_level);

// Load a module from a file path (NUL-terminated string; length is unused).
uint64_t cuda_shim_load_module_from_file(uint64_t file_path_ptr,
                                         uint64_t /*file_path_nbytes*/);

void cuda_shim_unload_module(uint64_t module_handle);

// ----------------------------- Memory --------------------------------------
uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream,
                          bool is_host_shared);

void cuda_shim_free(uint64_t dptr, uint64_t stream);

void cuda_shim_memset32(uint64_t dptr, uint32_t value, uint64_t count_dwords,
                        uint64_t stream);
// NOTE(review): the parameter is named count_dwords, but for a 16-bit memset
// it presumably counts 16-bit elements — confirm against cuda_shim.cpp.
void cuda_shim_memset16(uint64_t dptr, uint32_t value, uint64_t count_dwords,
                        uint64_t stream);

// ----------------------------- Streams -------------------------------------
uint64_t cuda_shim_stream_create(void);

void cuda_shim_stream_destroy(uint64_t stream);

void cuda_shim_stream_synchronize(uint64_t stream);

// ----------------------------- Events --------------------------------------
uint64_t cuda_shim_event_create(void);

void cuda_shim_event_destroy(uint64_t ev);

void cuda_shim_event_record(uint64_t ev, uint64_t stream);

void cuda_shim_event_synchronize(uint64_t ev);

void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev);

// ----------------------------- Memcpy (raw ABI) ----------------------------
// Host pointers are passed as uint64_t as well. This is the key of 2A.
void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr,
                          uint64_t nbytes);

void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr,
                          uint64_t nbytes);

// ----------------------------- Launch --------------------------------------
// Launch a kernel from a loaded module with a packed argument buffer:
// arg_data_ptr points at the raw argument bytes, arg_sizes_ptr at an array of
// per-argument byte sizes, num_args is the argument count.
void cuda_shim_launch_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
                             uint32_t gridX, uint32_t gridY, uint32_t gridZ,
                             uint32_t blockX, uint32_t blockY, uint32_t blockZ,
                             uint32_t sharedMemBytes, uint64_t stream,
                             uint64_t arg_data_ptr, uint64_t arg_sizes_ptr,
                             uint32_t num_args);

// Convenience variant: takes no grid dimensions (presumably a single-block
// launch — confirm in cuda_shim.cpp), shared=0, stream optional.
void cuda_shim_launch_block_packed(uint64_t module_handle,
                                   uint64_t kernel_name_ptr, uint32_t blockX,
                                   uint32_t blockY, uint32_t blockZ,
                                   uint64_t stream, uint64_t arg_data_ptr,
                                   uint64_t arg_sizes_ptr, uint32_t num_args);

// Optional global sync (avoid in async pipelines; prefer event/stream sync).
void cuda_shim_ctx_synchronize(void);

// Only for debugging: dump n floats starting at dptr to stderr.
void cuda_debug_dump_float(uint64_t dptr, int n);
}
371+
372+
// Build the ORC symbol map that exposes every cuda_shim_* entry point to
// JIT-compiled code. Keep this list in sync with the extern "C" block above:
// a shim that is declared but not registered here fails to resolve at JIT
// link time.
static inline llvm::orc::SymbolMap
buildCudaShimSymbolMap(llvm::orc::MangleAndInterner interner) {

  using llvm::JITSymbolFlags;
  using llvm::orc::ExecutorAddr;
  using llvm::orc::ExecutorSymbolDef;
  using llvm::orc::SymbolMap;

  SymbolMap syms;

  // Register one exported symbol under its unmangled C name.
  auto add = [&](const char *name, void *addr) {
    syms[interner(name)] =
        ExecutorSymbolDef::fromPtr(addr, JITSymbolFlags::Exported);
  };

  // ---- ctx ----
  add("cuda_shim_ctx_synchronize", (void *)&cuda_shim_ctx_synchronize);

  // ---- module ----
  add("cuda_shim_load_module_from_image",
      (void *)&cuda_shim_load_module_from_image);
  add("cuda_shim_load_module_jit_from_image",
      (void *)&cuda_shim_load_module_jit_from_image);
  add("cuda_shim_load_module_from_file",
      (void *)&cuda_shim_load_module_from_file);
  add("cuda_shim_unload_module", (void *)&cuda_shim_unload_module);

  // ---- memory ----
  add("cuda_shim_malloc", (void *)&cuda_shim_malloc);
  add("cuda_shim_free", (void *)&cuda_shim_free);
  // Fix: these two were declared in the shim ABI but never registered, so any
  // lowered IR calling them would fail symbol resolution in the JIT.
  add("cuda_shim_memset32", (void *)&cuda_shim_memset32);
  add("cuda_shim_memset16", (void *)&cuda_shim_memset16);

  // ---- memcpy ----
  add("cuda_shim_memcpy_h2d", (void *)&cuda_shim_memcpy_h2d);
  add("cuda_shim_memcpy_d2h", (void *)&cuda_shim_memcpy_d2h);

  // ---- stream ----
  add("cuda_shim_stream_create", (void *)&cuda_shim_stream_create);
  add("cuda_shim_stream_destroy", (void *)&cuda_shim_stream_destroy);
  add("cuda_shim_stream_synchronize", (void *)&cuda_shim_stream_synchronize);

  // ---- event ----
  add("cuda_shim_event_create", (void *)&cuda_shim_event_create);
  add("cuda_shim_event_destroy", (void *)&cuda_shim_event_destroy);
  add("cuda_shim_event_record", (void *)&cuda_shim_event_record);
  add("cuda_shim_event_synchronize", (void *)&cuda_shim_event_synchronize);
  add("cuda_shim_stream_wait_event", (void *)&cuda_shim_stream_wait_event);

  // ---- launch ----
  add("cuda_shim_launch_packed", (void *)&cuda_shim_launch_packed);
  add("cuda_shim_launch_block_packed", (void *)&cuda_shim_launch_block_packed);

  // ---- debug ----
  // Fix: also missing before; register so debug dumps work from JIT'd code.
  add("cuda_debug_dump_float", (void *)&cuda_debug_dump_float);

  return syms;
}
425+
426+
// Install the cuda_shim_* symbols into an ExecutionEngine so that JIT'd code
// can call into the shim runtime. Call once after the engine is created.
static inline void registerCudaShimSymbols(mlir::ExecutionEngine &engine) {
  auto symbolProvider = [](llvm::orc::MangleAndInterner interner) {
    return buildCudaShimSymbolMap(interner);
  };
  engine.registerSymbols(symbolProvider);
}

mlir/cuda-tile/Toy/include/toy/Passes.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ std::unique_ptr<mlir::Pass> createGpuOutlinePass(std::string grid = "1,1,1");
3434

3535
std::unique_ptr<mlir::Pass> createCudaTileLoweringPass();
3636

37-
std::unique_ptr<mlir::Pass>
38-
createEmbedCudaTileBinaryPass(std::string tileirasExe = "tileiras",
39-
std::string gpuName = "sm_120");
37+
std::unique_ptr<mlir::Pass> createEmbedCudaTileBinaryPass(
38+
std::string tileirasExe = "tileiras", std::string gpuName = "sm_120",
39+
std::string cubinOrPtxPath = "", bool useCache = true);
4040

4141
} // namespace toy
4242
} // namespace mlir

mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77
#include "toy/Dialect.h"
88
#include "llvm/ADT/SmallVector.h"
99
#include "llvm/ADT/StringRef.h"
10+
#include "llvm/Support/DebugLog.h"
1011
#include "llvm/Support/FileSystem.h"
1112
#include "llvm/Support/MemoryBuffer.h"
1213
#include "llvm/Support/Program.h"
1314
#include "llvm/Support/raw_ostream.h"
15+
#include <string>
1416
#include <system_error>
1517

1618
using namespace llvm;
@@ -84,9 +86,13 @@ struct EmbedCudaTileBinaryPass
8486

8587
std::string tileirasExe;
8688
std::string gpuName;
89+
std::string cubinOrPtxPath;
90+
bool useCache;
8791

88-
EmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName)
89-
: tileirasExe(std::move(tileirasExe)), gpuName(std::move(gpuName)) {}
92+
EmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName,
93+
std::string cubinOrPtxPath, bool useCache)
94+
: tileirasExe(std::move(tileirasExe)), gpuName(std::move(gpuName)),
95+
cubinOrPtxPath(std::move(cubinOrPtxPath)), useCache(useCache) {}
9096

9197
void runOnOperation() override {
9298
ModuleOp top = getOperation();
@@ -126,13 +132,38 @@ struct EmbedCudaTileBinaryPass
126132
return;
127133
}
128134

129-
if (std::error_code ec =
130-
createTemporaryFile(cudaBinPath, "cuda_tile", "bin")) {
131-
op->emitError() << "failed to create temp out bin: " << ec.message();
132-
signalPassFailure();
135+
if (cubinOrPtxPath.empty()) {
136+
if (std::error_code ec =
137+
createTemporaryFile(cudaBinPath, "cuda_tile", "bin")) {
138+
op->emitError() << "failed to create temp out bin: " << ec.message();
139+
signalPassFailure();
140+
return;
141+
}
142+
} else {
143+
if (!useCache) {
144+
if (llvm::sys::fs::exists(cubinOrPtxPath)) {
145+
op->emitWarning() << "cuda binary file exist " << cubinOrPtxPath
146+
<< ", tileiras will overwrite it.";
147+
std::error_code ec = llvm::sys::fs::remove(cubinOrPtxPath);
148+
if (ec) {
149+
op->emitError() << "failed to remove existing cuda binary file: "
150+
<< ec.message();
151+
signalPassFailure();
152+
return;
153+
}
154+
}
155+
}
156+
cudaBinPath = cubinOrPtxPath;
157+
}
158+
159+
if (useCache && llvm::sys::fs::exists(cudaBinPath)) {
160+
LDBG() << "cuda binary file exist and will be reused: " << cudaBinPath
161+
<< "\n";
133162
return;
134163
}
135164

165+
// ! [FIXME]: please comment out this following code since this is only
166+
// for testing.
136167
if (failed(writeFileBytes(inPath, tilebcBytes))) {
137168
op->emitError() << "failed to write temp tilebc";
138169
signalPassFailure();
@@ -145,6 +176,8 @@ struct EmbedCudaTileBinaryPass
145176
}
146177
});
147178

179+
LDBG() << "cuda binary path: " << cudaBinPath << "\n";
180+
148181
top->walk([&](toy::LaunchGpuOp launchOp) {
149182
// ---- Step D: read cuda binary bytes ----
150183
auto binBytesOrErr = readFileBytes(cudaBinPath);
@@ -189,8 +222,10 @@ struct EmbedCudaTileBinaryPass
189222
namespace mlir::toy {
190223

191224
std::unique_ptr<mlir::Pass>
192-
createEmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName) {
193-
return std::make_unique<EmbedCudaTileBinaryPass>(tileirasExe, gpuName);
225+
createEmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName,
226+
std::string cubinOrPtxPath, bool useCache) {
227+
return std::make_unique<EmbedCudaTileBinaryPass>(tileirasExe, gpuName,
228+
cubinOrPtxPath, useCache);
194229
};
195230

196231
}; // namespace mlir::toy

0 commit comments

Comments
 (0)