Added the EntryOp for cuda tile IR

Alwaysproblem · Alwaysproblem · commit 60b8083ffc73 · 2026-01-03T00:54:44.000Z
diff --git a/mlir/cuda-tile/CMakeLists.txt b/mlir/cuda-tile/CMakeLists.txt
@@ -20,11 +20,17 @@ message(STATUS "MLIR_INCLUDE_DIR include dir: ${MLIR_INCLUDE_DIR}")
 # This is for non-conda users. 
 find_package(LLVM CONFIG PATHS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib/cmake/llvm) 
 find_package(MLIR CONFIG PATHS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib/cmake/mlir)
-
+find_package(CUDAToolkit REQUIRED)
 # set(MLIR_TABLEGEN_EXE ${CMAKE_CURRENT_SOURCE_DIR}/third_party/bin/mlir-tblgen)
+message(STATUS "CUDA Toolkit found: ${CUDAToolkit_INCLUDE_DIRS}")
+message(STATUS "CUDA_TILE_SOURCE_DIR include dir: ${CUDA_TILE_SOURCE_DIR}")
+message(STATUS "CUDA_TILE_BINARY_DIR include dir: ${CUDA_TILE_BINARY_DIR}")
 
 include_directories(${LLVM_INCLUDE_DIR})
 include_directories(${MLIR_INCLUDE_DIR})
+include_directories(${CUDAToolkit_INCLUDE_DIRS})
+include_directories(${CUDA_TILE_SOURCE_DIR}/include)
+include_directories(${CUDA_TILE_BINARY_DIR}/include)
 
 include(LLVMDistributionSupport)
 include(TableGen)
diff --git a/mlir/cuda-tile/Toy/CMakeLists.txt b/mlir/cuda-tile/Toy/CMakeLists.txt
@@ -1,7 +1,3 @@
-find_package(CUDAToolkit REQUIRED)
-
-message(STATUS "CUDA Toolkit found: ${CUDAToolkit_INCLUDE_DIRS}")
-
 # For a better template to copy, see examples/standalone
 include_directories(include)
 add_subdirectory(include)
@@ -30,6 +26,7 @@ add_executable(
   mlir/ShapeInferencePass.cpp
   mlir/ToyCombine.cpp
   mlir/LowerToGpu.cpp
+  mlir/LowerToCudaTile.cpp
   )
 
 add_dependencies(toy-cuda
@@ -62,5 +59,6 @@ target_link_libraries(toy-cuda
     MLIRSupport
     MLIRTargetLLVMIRExport
     MLIRTransforms
+    CudaTileDialect
     cuda_shim
   )
diff --git a/mlir/cuda-tile/Toy/include/toy/Passes.h b/mlir/cuda-tile/Toy/include/toy/Passes.h
@@ -32,6 +32,8 @@ std::unique_ptr<mlir::Pass> createLowerToLLVMPass();
 
 std::unique_ptr<mlir::Pass> createGpuOutlinePass(std::string grid="1,1,1");
 
+std::unique_ptr<mlir::Pass> createCudaTileLoweringPass();
+
 } // namespace toy
 } // namespace mlir
 
diff --git a/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp b/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp
@@ -0,0 +1,133 @@
+#include "cuda_tile/Dialect/CudaTile/IR/Types.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Types.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/TypeID.h"
+#include "toy/Dialect.h"
+#include "toy/Passes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugLog.h"
+
+#include "cuda_tile/Dialect/CudaTile/IR/Dialect.h"
+#include "cuda_tile/Dialect/CudaTile/IR/Ops.h"
+
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "toy-to-cuda-tile"
+
+//===----------------------------------------------------------------------===//
+// ToyToCudaTileLoweringPass
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Module pass that creates a `cuda_tile::EntryOp` for each `toy::GPUFuncOp`.
+struct ToyToCudaTileLoweringPass
+    : public mlir::PassWrapper<ToyToCudaTileLoweringPass,
+                               mlir::OperationPass<mlir::ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToCudaTileLoweringPass)
+  llvm::StringRef getArgument() const override { return "toy-to-cuda-tile"; }
+
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {
+    registry.insert<mlir::cuda_tile::CudaTileDialect>();
+  }
+
+  void runOnOperation() final;
+};
+} // namespace
+
+/// Creates a cuda_tile module op as the last op in the body of `moduleOp`.
+mlir::cuda_tile::ModuleOp createCudaModuleOp(mlir::OpBuilder &builder,
+                                             mlir::ModuleOp &moduleOp) {
+  // Restores the caller's insertion point once this function returns.
+  mlir::OpBuilder::InsertionGuard restoreInsertionPoint(builder);
+  builder.setInsertionPointToEnd(moduleOp.getBody());
+  auto tileModule = mlir::cuda_tile::ModuleOp::create(
+      builder, moduleOp.getLoc(), "cuda_tile_module");
+  LDBG() << "Created CudaTile Module: \n" << tileModule << "\n";
+  return tileModule;
+}
+
+/// Appends a cuda_tile module to the processed module and creates one
+/// `cuda_tile::EntryOp` in it per `toy::GPUFuncOp`. Tensor inputs and results
+/// all become entry arguments (inputs first); the entry has no results and a
+/// body holding only a return op. Fails on a non-tensor input or result.
+void ToyToCudaTileLoweringPass::runOnOperation() {
+  auto moduleOp = getOperation();
+  mlir::OpBuilder builder(moduleOp.getContext());
+  // 1. Create new cuda_tile.module Op in the last section.
+  auto cudaTileModuleOp = createCudaModuleOp(builder, moduleOp);
+
+  auto walkResult = moduleOp->walk([&](mlir::toy::GPUFuncOp gfunOp) {
+    mlir::OpBuilder::InsertionGuard guard(builder);
+    // setInsertionPointToEnd expects a Block*, so take the address of the
+    // single block inside the cuda_tile.module region.
+    builder.setInsertionPointToEnd(&cudaTileModuleOp.getBodyRegion().front());
+    auto gfunc_name =
+        gfunOp->getAttrOfType<mlir::StringAttr>("sym_name").getValue();
+    LDBG() << "Lowering GPU function: " << gfunc_name << "\n";
+
+    // Converts each tensor type to a TileType wrapping a PointerType to its
+    // element type; a non-tensor is diagnosed rather than null-dereferenced.
+    llvm::SmallVector<mlir::Type, 8> newArgTypes;
+    auto appendConverted = [&](llvm::ArrayRef<mlir::Type> types,
+                               llvm::StringRef kind) -> mlir::LogicalResult {
+      for (mlir::Type t : types) {
+        LDBG() << "Original " << kind << " type: " << t << "\n";
+        auto tt = llvm::dyn_cast<mlir::TensorType>(t);
+        if (!tt)
+          return gfunOp.emitError("expected tensor ")
+                 << kind << " type, got " << t;
+        auto ptrElem = mlir::cuda_tile::PointerType::get(tt.getElementType());
+        auto newType = mlir::cuda_tile::TileType::get({}, ptrElem);
+        LDBG() << "The new " << kind << " type for cuda tile: " << newType
+               << "\n";
+        newArgTypes.push_back(newType);
+      }
+      return mlir::success();
+    };
+    auto fnType = gfunOp.getFunctionType();
+    LDBG() << "Converting input type into cuda tile type" << "\n";
+    if (mlir::failed(appendConverted(fnType.getInputs(), "arg")))
+      return mlir::WalkResult::interrupt();
+    LDBG() << "Converting result type into cuda tile type" << "\n";
+    if (mlir::failed(appendConverted(fnType.getResults(), "result")))
+      return mlir::WalkResult::interrupt();
+
+    auto newFnType = builder.getFunctionType(newArgTypes, {});
+    auto fname = builder.getStringAttr(gfunc_name);
+    auto cudaEntryOp = mlir::cuda_tile::EntryOp::create(
+        builder, gfunOp.getLoc(), fname, newFnType,
+        /*arg_attrs=*/{}, /*res_attrs=*/{}, {});
+    builder.setInsertionPointToStart(cudaEntryOp.addEntryBlock());
+    mlir::cuda_tile::ReturnOp::create(builder, gfunOp.getLoc());
+    LDBG() << "Created CudaTile Entry Op: \n" << cudaEntryOp << "\n";
+    return mlir::WalkResult::advance();
+  });
+  if (walkResult.wasInterrupted())
+    signalPassFailure();
+}
+
+namespace mlir::toy {
+/// Creates the module-level pass that lowers Toy GPU functions to cuda_tile.
+std::unique_ptr<mlir::Pass> createCudaTileLoweringPass() {
+  return std::make_unique<ToyToCudaTileLoweringPass>();
+}
+
+} // namespace mlir::toy
diff --git a/mlir/cuda-tile/Toy/toyc.cpp b/mlir/cuda-tile/Toy/toyc.cpp
@@ -333,7 +333,8 @@ static int loadAndProcessMLIRGPU(mlir::MLIRContext &context,
 
   // Now process the toy mlir with gpu outline pass.
   optPM.addPass(mlir::toy::createGpuOutlinePass(assignGrid));
-  // pm.addPass(mlir::toy::createCudaTileLoweringPass());
+  // mlir::OpPassManager &gpuOptPM = pm.nest<mlir::toy::GPUFuncOp>();
+  pm.addPass(mlir::toy::createCudaTileLoweringPass());
   // pm.addPass(mlir::toy::createLowerGpuHostToLLVMPass());
   // bool isLoweringToAffine = emitAction >= Action::DumpGpuAffine;
   // if (isLoweringToAffine) {
diff --git a/mlir/cuda-tile/explore/outlined.mlir b/mlir/cuda-tile/explore/outlined.mlir
@@ -0,0 +1,22 @@
+module {
+  toy.func @main() {
+    %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
+    %1 = toy.constant dense<[[1.100000e+01, 1.200000e+01, 1.300000e+01], [1.400000e+01, 1.500000e+01, 1.600000e+01]]> : tensor<2x3xf32>
+    %2 = toy.launch_gpu @outlined_gpu_kernel_0(%1, %0) {grid = array<i64: 4, 2, 1>} : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x2xf32>
+    toy.print %2 : tensor<2x2xf32>
+    %3 = toy.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00], [1.000000e+01, 1.100000e+01, 1.200000e+01]]> : tensor<2x3xf32>
+    %4 = toy.launch_gpu @outlined_gpu_kernel_1(%0, %3, %1) {grid = array<i64: 4, 2, 1>} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
+    toy.print %4 : tensor<2x3xf32>
+    toy.return
+  }
+  toy.gpu_func @outlined_gpu_kernel_0(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<2x2xf32> {
+    %0 = toy.transpose(%arg0 : tensor<2x3xf32>) to tensor<3x2xf32>
+    %1 = toy.matmul(%arg1 : tensor<2x3xf32>, %0 : tensor<3x2xf32>) to tensor<2x2xf32>
+    toy.return %1 : tensor<2x2xf32>
+  }
+  toy.gpu_func @outlined_gpu_kernel_1(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xf32>) -> tensor<2x3xf32> {
+    %0 = toy.mul %arg0, %arg1 : tensor<2x3xf32>
+    %1 = toy.add %0, %arg2 : tensor<2x3xf32>
+    toy.return %1 : tensor<2x3xf32>
+  }
+}
diff --git a/mlir/cuda-tile/scripts/build_cuda_tile.sh b/mlir/cuda-tile/scripts/build_cuda_tile.sh
@@ -17,7 +17,7 @@ cmake -G Ninja -S ${WORKSPACEROOT}/third_party/cuda-tile -B build \
   -DLLVM_ENABLE_ASSERTIONS=OFF \
   -DCUDA_TILE_ENABLE_BINDINGS_PYTHON=OFF \
   -DCUDA_TILE_ENABLE_TESTING=OFF \
-  -DCUDA_TILE_INSTALL_DIR=${WORKSPACEROOT}/third_party/cuda \
+  -DCMAKE_INSTALL_PREFIX=${WORKSPACEROOT}/third_party/cuda \
   -DCUDA_TILE_USE_LLVM_INSTALL_DIR=${WORKSPACEROOT}/third_party/llvm
 
 cmake --build build