Skip to content

Commit 5007d48

Browse files
committed
sync
1 parent 034cc87 commit 5007d48

1 file changed

Lines changed: 64 additions & 5 deletions

File tree

mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -412,11 +412,13 @@ memref::GlobalOp createGlobalForStringAttr(mlir::PatternRewriter &rewriter,
412412
std::vector<uint8_t> bytes(str.begin(), str.end());
413413
bytes.push_back(0);
414414

415+
auto type = RankedTensorType::get({(int64_t)bytes.size()},
416+
rewriter.getIntegerType(8));
417+
415418
auto memrefType =
416419
MemRefType::get({(int64_t)bytes.size()}, rewriter.getIntegerType(8));
417420

418-
auto denseAttr =
419-
DenseElementsAttr::get(memrefType, llvm::ArrayRef<uint8_t>(bytes));
421+
auto denseAttr = DenseElementsAttr::get(type, llvm::ArrayRef<uint8_t>(bytes));
420422

421423
auto global = memref::GlobalOp::create(
422424
rewriter, loc, sym_name,
@@ -428,6 +430,22 @@ memref::GlobalOp createGlobalForStringAttr(mlir::PatternRewriter &rewriter,
428430
return global;
429431
}
430432

433+
/// Emits a `memref.get_global` for `global` plus an
/// `extract_aligned_pointer_as_index` on it, then returns the global's
/// first-dimension size (its byte length) cast to i64.
///
/// \param rewriter  pattern rewriter used to create the ops at `loc`.
/// \param loc       source location attached to every created op.
/// \param global    rank-1 i8 global (a NUL-terminated string blob).
/// \return the `arith.index_cast` producing the size as i64.
///
/// NOTE(review): `extractOp`'s result is never consumed — the function
/// returns the global's *size*, not the extracted pointer index, despite
/// the name. The op is still emitted here so the lowered IR is unchanged;
/// confirm whether the pointer or the size is the intended return value.
arith::IndexCastOp getIndexFromGlobalMemref(mlir::PatternRewriter &rewriter,
                                            Location loc,
                                            memref::GlobalOp global) {
  // Materialize the global so its address can be taken.
  auto getGlobalOp = memref::GetGlobalOp::create(
      rewriter, loc, global->getResult(0).getType(), global.getName());
  // Kept for its IR side effect only; result is intentionally unused
  // (see NOTE above).
  [[maybe_unused]] memref::ExtractAlignedPointerAsIndexOp extractOp =
      memref::ExtractAlignedPointerAsIndexOp::create(
          rewriter, loc, rewriter.getIndexType(), getGlobalOp.getResult());

  // The global is rank-1, so shape[0] is its total element count
  // (== byte count for an i8 string global).
  auto globalType = llvm::cast<MemRefType>(global.getType());
  auto size = globalType.getShape()[0];
  auto sizeValue = rewriter.create<arith::ConstantIndexOp>(loc, size);
  return rewriter.create<arith::IndexCastOp>(loc, rewriter.getI64Type(),
                                             sizeValue);
}
431449
struct LanchGpuLowering : public ConversionPattern {
432450
LanchGpuLowering(MLIRContext *ctx)
433451
: ConversionPattern(toy::LaunchGpuOp::getOperationName(), 1, ctx) {}
@@ -446,6 +464,33 @@ struct LanchGpuLowering : public ConversionPattern {
446464
}
447465
}
448466

467+
// %3 = toy.launch_gpu @outlined_gpu_kernel_0(%0, %2, %1) {cuda_arch =
468+
// "sm_120", cuda_binary_path = "/tmp/cuda_tile-d7f3fd.bin",
469+
// cuda_binary_size = 10112 : i64, grid = array<i64: 1, 1, 1>} :
470+
// (tensor<2x4xf32>, tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32>
471+
// the `%3` is the output value of the launch op, `%0`, `%2`, `%1` are the
472+
// operands to be passed to the GPU kernel, and the attributes are the
473+
// launch configuration for the GPU kernel.
474+
// so we need to create the output tensor since the cuda tile entry op
475+
// will take the output tensor as argument instead of return value.
476+
// and as we did before, the last operand is the output tensor, and the rest
477+
// are input tensors.
478+
// moreover, we assume the memory lifetime of the output tensor is the
479+
// whole main function, so we can just allocate it at the beginning of the
480+
// main function and pass it to the cuda tile entry op which means we won't
481+
// promote the deallocation of the output tensor from the last op of the
482+
// block to the end of use.
483+
484+
auto outputType = llvm::cast<RankedTensorType>(
485+
launchGpuOp->getResults().front().getType());
486+
487+
auto outputMemRefType = convertTensorToMemRef(outputType);
488+
auto outputTensorAlloc =
489+
insertAllocAndDealloc(outputMemRefType, loc, rewriter);
490+
491+
// extract the `cuda_binary_path` attribute from the launch op, and create a
492+
// global memref for it, which will be used in the cuda tile entry op to
493+
// load the cuda binary.
449494
auto cudaBinaryPathAttr =
450495
launchGpuOp->getDiscardableAttr("cuda_binary_path");
451496
if (!cudaBinaryPathAttr) {
@@ -467,15 +512,29 @@ struct LanchGpuLowering : public ConversionPattern {
467512
auto kernel_name_memref = createGlobalForStringAttr(
468513
rewriter, launchGpuOp, "kname", rewriter.getStringAttr(kernelName));
469514

470-
auto nbytesVal = arith::ConstantIndexOp::create(rewriter, loc, 1);
471-
auto streamVal = arith::ConstantIndexOp::create(rewriter, loc, 0);
515+
// load the cuda binary path from the global memref.
516+
auto cuda_blob_loaded = memref::GetGlobalOp::create(
517+
rewriter, loc, cuda_blob_memref->getResult(0).getType(), "cuda_blob");
518+
519+
auto kname_loaded = memref::GetGlobalOp::create(
520+
rewriter, loc, kernel_name_memref->getResult(0).getType(), "kname");
521+
522+
// handle the input of the launch op, we will create a cuda allocation for
523+
// each input tensor.
524+
for (auto operand : launchGpuOp->getOperands()) {
525+
auto ranked_tensor_type = llvm::cast<RankedTensorType>(operand.getType());
526+
auto shape = ranked_tensor_type.getShape();
527+
}
528+
529+
auto nbytesVal = arith::ConstantIntOp::create(rewriter, loc, 1, 64);
530+
auto streamVal = arith::ConstantIntOp::create(rewriter, loc, 0, 64);
472531
auto isHostSharedVal = arith::ConstantIntOp::create(rewriter, loc, 0, 1);
473532

474533
auto callee =
475534
registry.call(rewriter, launchGpuOp, CudaShimFn::Malloc,
476535
ValueRange{nbytesVal, streamVal, isHostSharedVal});
477536

478-
rewriter.replaceOp(op, callee);
537+
rewriter.replaceOp(op, outputTensorAlloc);
479538
return success();
480539
}
481540
};

0 commit comments

Comments
 (0)