@@ -412,11 +412,13 @@ memref::GlobalOp createGlobalForStringAttr(mlir::PatternRewriter &rewriter,
412412 std::vector<uint8_t > bytes (str.begin (), str.end ());
413413 bytes.push_back (0 );
414414
415+ auto type = RankedTensorType::get ({(int64_t )bytes.size ()},
416+ rewriter.getIntegerType (8 ));
417+
415418 auto memrefType =
416419 MemRefType::get ({(int64_t )bytes.size ()}, rewriter.getIntegerType (8 ));
417420
418- auto denseAttr =
419- DenseElementsAttr::get (memrefType, llvm::ArrayRef<uint8_t >(bytes));
421+ auto denseAttr = DenseElementsAttr::get (type, llvm::ArrayRef<uint8_t >(bytes));
420422
421423 auto global = memref::GlobalOp::create (
422424 rewriter, loc, sym_name,
@@ -428,6 +430,22 @@ memref::GlobalOp createGlobalForStringAttr(mlir::PatternRewriter &rewriter,
428430 return global;
429431}
430432
433+ arith::IndexCastOp getIndexFromGlobalMemref (mlir::PatternRewriter &rewriter,
434+ Location loc,
435+ memref::GlobalOp global) {
436+ auto getGlobalOp = memref::GetGlobalOp::create (
437+ rewriter, loc, global->getResult (0 ).getType (), global.getName ());
438+ memref::ExtractAlignedPointerAsIndexOp extractOp =
439+ memref::ExtractAlignedPointerAsIndexOp::create (
440+ rewriter, loc, rewriter.getIndexType (), getGlobalOp.getResult ());
441+
442+ auto globalType = llvm::cast<MemRefType>(global.getType ());
443+ auto size = globalType.getShape ()[0 ];
444+ auto sizeValue = rewriter.create <arith::ConstantIndexOp>(loc, size);
445+ return rewriter.create <arith::IndexCastOp>(loc, rewriter.getI64Type (),
446+ sizeValue);
447+ }
448+
431449struct LanchGpuLowering : public ConversionPattern {
432450 LanchGpuLowering (MLIRContext *ctx)
433451 : ConversionPattern(toy::LaunchGpuOp::getOperationName(), 1 , ctx) {}
@@ -446,6 +464,33 @@ struct LanchGpuLowering : public ConversionPattern {
446464 }
447465 }
448466
467+ // %3 = toy.launch_gpu @outlined_gpu_kernel_0(%0, %2, %1) {cuda_arch =
468+ // "sm_120", cuda_binary_path = "/tmp/cuda_tile-d7f3fd.bin",
469+ // cuda_binary_size = 10112 : i64, grid = array<i64: 1, 1, 1>} :
470+ // (tensor<2x4xf32>, tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
471+ // Here `%3` is the result of the launch op, `%0`, `%2`, `%1` are the
472+ // operands passed to the GPU kernel, and the attributes hold the launch
473+ // configuration for the GPU kernel.
474+ // We need to create the output tensor ourselves because the cuda tile
475+ // entry op takes the output tensor as an argument instead of returning it.
476+ // As before, the last operand is the output tensor and the rest are input
477+ // tensors.
478+ // Moreover, we assume the lifetime of the output tensor is the whole main
479+ // function, so we can simply allocate it at the beginning of the main
480+ // function and pass it to the cuda tile entry op. This means we do not
481+ // move the deallocation of the output tensor from the last op of the
482+ // block up to the end of its use.
483+
484+ auto outputType = llvm::cast<RankedTensorType>(
485+ launchGpuOp->getResults ().front ().getType ());
486+
487+ auto outputMemRefType = convertTensorToMemRef (outputType);
488+ auto outputTensorAlloc =
489+ insertAllocAndDealloc (outputMemRefType, loc, rewriter);
490+
491+ // extract the `cuda_binary_path` attribute from the launch op, and create a
492+ // global memref for it, which will be used in the cuda tile entry op to
493+ // load the cuda binary.
449494 auto cudaBinaryPathAttr =
450495 launchGpuOp->getDiscardableAttr (" cuda_binary_path" );
451496 if (!cudaBinaryPathAttr) {
@@ -467,15 +512,29 @@ struct LanchGpuLowering : public ConversionPattern {
467512 auto kernel_name_memref = createGlobalForStringAttr (
468513 rewriter, launchGpuOp, " kname" , rewriter.getStringAttr (kernelName));
469514
470- auto nbytesVal = arith::ConstantIndexOp::create (rewriter, loc, 1 );
471- auto streamVal = arith::ConstantIndexOp::create (rewriter, loc, 0 );
515+ // load the cuda binary path from the global memref.
516+ auto cuda_blob_loaded = memref::GetGlobalOp::create (
517+ rewriter, loc, cuda_blob_memref->getResult (0 ).getType (), " cuda_blob" );
518+
519+ auto kname_loaded = memref::GetGlobalOp::create (
520+ rewriter, loc, kernel_name_memref->getResult (0 ).getType (), " kname" );
521+
522+ // handle the input of the launch op, we will create a cuda allocation for
523+ // each input tensor.
524+ for (auto operand : launchGpuOp->getOperands ()) {
525+ auto ranked_tensor_type = llvm::cast<RankedTensorType>(operand.getType ());
526+ auto shape = ranked_tensor_type.getShape ();
527+ }
528+
529+ auto nbytesVal = arith::ConstantIntOp::create (rewriter, loc, 1 , 64 );
530+ auto streamVal = arith::ConstantIntOp::create (rewriter, loc, 0 , 64 );
472531 auto isHostSharedVal = arith::ConstantIntOp::create (rewriter, loc, 0 , 1 );
473532
474533 auto callee =
475534 registry.call (rewriter, launchGpuOp, CudaShimFn::Malloc,
476535 ValueRange{nbytesVal, streamVal, isHostSharedVal});
477536
478- rewriter.replaceOp (op, callee );
537+ rewriter.replaceOp (op, outputTensorAlloc );
479538 return success ();
480539 }
481540};
0 commit comments