|
9 | 9 | #include "mlir/Dialect/Arith/IR/Arith.h" |
10 | 10 | #include "mlir/Dialect/Func/IR/FuncOps.h" |
11 | 11 | #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| 12 | +#include "mlir/ExecutionEngine/ExecutionEngine.h" |
12 | 13 | #include "mlir/IR/Builders.h" |
13 | 14 | #include "mlir/IR/BuiltinAttributes.h" |
14 | 15 | #include "mlir/IR/BuiltinOps.h" |
@@ -296,3 +297,134 @@ inline unsigned long getNbytes(mlir::Type tensorType) { |
296 | 297 | ranked_tensor_type.getElementTypeBitWidth(), |
297 | 298 | 8); |
298 | 299 | } |
| 300 | + |
// Host-side CUDA shim entry points. They are declared with C linkage so their
// symbol names are not C++-mangled. Module/stream/event handles as well as
// device and host pointers are all passed as plain uint64_t values.
extern "C" {

// ------------------------------- Context ---------------------------------
// Optional global synchronization. Avoid it in an async pipeline; prefer the
// event/stream synchronization entry points below.
void cuda_shim_ctx_synchronize(void);

// ------------------------------- Modules ---------------------------------
// Load a module from a PTX or CUBIN image held in memory. The Driver API's
// cuModuleLoadDataEx accepts both PTX and cubin (it auto-detects the format).
// Returns a module handle as uint64_t.
uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr,
                                          uint64_t image_nbytes);

// Variant of the in-memory load that additionally takes `opt_level`.
// NOTE(review): presumably the JIT optimization level; confirm against the
// implementation.
uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr,
                                              uint64_t image_nbytes,
                                              int opt_level);

// Load a module from a file path passed as a pointer value. The second
// (byte-length) argument is intentionally left unnamed in this declaration.
uint64_t cuda_shim_load_module_from_file(uint64_t file_path_ptr,
                                         uint64_t /*file_path_nbytes*/);

// Release a module handle returned by one of the load functions.
void cuda_shim_unload_module(uint64_t module_handle);

// ------------------------------- Memory ----------------------------------
uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream,
                          bool is_host_shared);
void cuda_shim_free(uint64_t dptr, uint64_t stream);

void cuda_shim_memset32(uint64_t dptr, uint32_t value, uint64_t count_dwords,
                        uint64_t stream);
// NOTE(review): this signature mirrors cuda_shim_memset32 (32-bit `value`,
// count named `count_dwords`); confirm the units the implementation expects.
void cuda_shim_memset16(uint64_t dptr, uint32_t value, uint64_t count_dwords,
                        uint64_t stream);

// ----------------------------- Memcpy (raw ABI) --------------------------
// Host pointers are passed as uint64_t.
void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr,
                          uint64_t nbytes);
void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr,
                          uint64_t nbytes);

// ------------------------------- Streams ---------------------------------
uint64_t cuda_shim_stream_create(void);
void cuda_shim_stream_destroy(uint64_t stream);
void cuda_shim_stream_synchronize(uint64_t stream);

// ------------------------------- Events ----------------------------------
uint64_t cuda_shim_event_create(void);
void cuda_shim_event_destroy(uint64_t ev);
void cuda_shim_event_record(uint64_t ev, uint64_t stream);
void cuda_shim_event_synchronize(uint64_t ev);
void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev);

// ------------------------------- Launch ----------------------------------
// Launch the kernel named by `kernel_name_ptr` from `module_handle` with an
// explicit grid/block shape, shared-memory size and stream. Kernel arguments
// are supplied in packed form via `arg_data_ptr` / `arg_sizes_ptr` with
// `num_args` entries.
void cuda_shim_launch_packed(uint64_t module_handle, uint64_t kernel_name_ptr,
                             uint32_t gridX, uint32_t gridY, uint32_t gridZ,
                             uint32_t blockX, uint32_t blockY, uint32_t blockZ,
                             uint32_t sharedMemBytes, uint64_t stream,
                             uint64_t arg_data_ptr, uint64_t arg_sizes_ptr,
                             uint32_t num_args);

// Convenience: 1D launch, shared=0, stream optional.
// NOTE(review): the signature takes three block dimensions and no grid
// dimensions; confirm against the implementation what "1D launch" covers.
void cuda_shim_launch_block_packed(uint64_t module_handle,
                                   uint64_t kernel_name_ptr, uint32_t blockX,
                                   uint32_t blockY, uint32_t blockZ,
                                   uint64_t stream, uint64_t arg_data_ptr,
                                   uint64_t arg_sizes_ptr, uint32_t num_args);

// ------------------------------- Debug -----------------------------------
// Only for debugging.
void cuda_debug_dump_float(uint64_t dptr, int n);

} // extern "C"
| 371 | + |
| 372 | +static inline llvm::orc::SymbolMap |
| 373 | +buildCudaShimSymbolMap(llvm::orc::MangleAndInterner interner) { |
| 374 | + |
| 375 | + using llvm::JITSymbolFlags; |
| 376 | + using llvm::orc::ExecutorAddr; |
| 377 | + using llvm::orc::ExecutorSymbolDef; |
| 378 | + using llvm::orc::SymbolMap; |
| 379 | + |
| 380 | + SymbolMap syms; |
| 381 | + |
| 382 | + auto add = [&](const char *name, void *addr) { |
| 383 | + syms[interner(name)] = |
| 384 | + ExecutorSymbolDef::fromPtr(addr, JITSymbolFlags::Exported); |
| 385 | + }; |
| 386 | + |
| 387 | + // ---- ctx ---- |
| 388 | + add("cuda_shim_ctx_synchronize", (void *)&cuda_shim_ctx_synchronize); |
| 389 | + |
| 390 | + // ---- module ---- |
| 391 | + add("cuda_shim_load_module_from_image", |
| 392 | + (void *)&cuda_shim_load_module_from_image); |
| 393 | + add("cuda_shim_load_module_jit_from_image", |
| 394 | + (void *)&cuda_shim_load_module_jit_from_image); |
| 395 | + add("cuda_shim_load_module_from_file", |
| 396 | + (void *)&cuda_shim_load_module_from_file); |
| 397 | + add("cuda_shim_unload_module", (void *)&cuda_shim_unload_module); |
| 398 | + |
| 399 | + // ---- memory ---- |
| 400 | + add("cuda_shim_malloc", (void *)&cuda_shim_malloc); |
| 401 | + add("cuda_shim_free", (void *)&cuda_shim_free); |
| 402 | + |
| 403 | + // ---- memcpy ---- |
| 404 | + add("cuda_shim_memcpy_h2d", (void *)&cuda_shim_memcpy_h2d); |
| 405 | + add("cuda_shim_memcpy_d2h", (void *)&cuda_shim_memcpy_d2h); |
| 406 | + |
| 407 | + // ---- stream ---- |
| 408 | + add("cuda_shim_stream_create", (void *)&cuda_shim_stream_create); |
| 409 | + add("cuda_shim_stream_destroy", (void *)&cuda_shim_stream_destroy); |
| 410 | + add("cuda_shim_stream_synchronize", (void *)&cuda_shim_stream_synchronize); |
| 411 | + |
| 412 | + // ---- event ---- |
| 413 | + add("cuda_shim_event_create", (void *)&cuda_shim_event_create); |
| 414 | + add("cuda_shim_event_destroy", (void *)&cuda_shim_event_destroy); |
| 415 | + add("cuda_shim_event_record", (void *)&cuda_shim_event_record); |
| 416 | + add("cuda_shim_event_synchronize", (void *)&cuda_shim_event_synchronize); |
| 417 | + add("cuda_shim_stream_wait_event", (void *)&cuda_shim_stream_wait_event); |
| 418 | + |
| 419 | + // ---- launch ---- |
| 420 | + add("cuda_shim_launch_packed", (void *)&cuda_shim_launch_packed); |
| 421 | + add("cuda_shim_launch_block_packed", (void *)&cuda_shim_launch_block_packed); |
| 422 | + |
| 423 | + return syms; |
| 424 | +} |
| 425 | + |
| 426 | +static inline void registerCudaShimSymbols(mlir::ExecutionEngine &engine) { |
| 427 | + engine.registerSymbols([](llvm::orc::MangleAndInterner interner) { |
| 428 | + return buildCudaShimSymbolMap(interner); |
| 429 | + }); |
| 430 | +} |
0 commit comments