Skip to content

Commit 2b310d3

Browse files
committed
Added Matmul Toy Op
1 parent e3204ff commit 2b310d3

File tree

6 files changed

+213
-2
lines changed

6 files changed

+213
-2
lines changed

mlir/cuda-tile/Toy/include/toy/Ops.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,4 +374,33 @@ def TransposeOp : Toy_Op<"transpose",
374374
let hasVerifier = 1;
375375
}
376376

377+
//===----------------------------------------------------------------------===//
// MatMul Op
//===----------------------------------------------------------------------===//

def MatMulOp : Toy_Op<"matmul",
    [Pure, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>, MemoryEffectsOpInterface]> {
  let summary = "matrix multiplication operation";
  let description = [{
    The "matmul" operation performs matrix multiplication between two 2-D
    tensors. The inner dimensions of the operands must match: for an lhs of
    shape [m, k] and an rhs of shape [k, n] the result has shape [m, n].
  }];

  // NOTE(review): `Pure` implies the op has no memory effects, which
  // conflicts with the MemWrite/MemAlloc effects declared on the result
  // below — confirm which of the two is actually intended.
  let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs);
  let results = (outs Res<F64Tensor, "",
                          [MemWrite<DefaultResource>,
                           MemAlloc<DefaultResource>]>:$output);

  let assemblyFormat = [{
    `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results)
  }];

  // Allow building a MatMulOp from the two input operands.
  let builders = [
    OpBuilder<(ins "Value":$lhs, "Value":$rhs)>
  ];

  let hasVerifier = 1;
}
405+
377406
#endif // TOY_OPS

mlir/cuda-tile/Toy/mlir/Dialect.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,58 @@ llvm::LogicalResult TransposeOp::verify() {
437437
return mlir::success();
438438
}
439439

440+
//===----------------------------------------------------------------------===//
441+
// MatMulOp
442+
//===----------------------------------------------------------------------===//
443+
444+
void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state,
445+
mlir::Value lhs, mlir::Value rhs) {
446+
state.addTypes(UnrankedTensorType::get(builder.getF64Type()));
447+
state.addOperands({lhs, rhs});
448+
}
449+
450+
/// Infer the output shape of the MatMulOp, this is required by the shape
451+
/// inference interface.
452+
void MatMulOp::inferShapes() {
453+
RankedTensorType lhsType =
454+
llvm::dyn_cast<RankedTensorType>(getLhs().getType());
455+
RankedTensorType rhsType =
456+
llvm::dyn_cast<RankedTensorType>(getRhs().getType());
457+
auto lhsShape = lhsType.getShape();
458+
auto rhsShape = rhsType.getShape();
459+
RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]},
460+
lhsType.getElementType());
461+
getResult().setType(res_type);
462+
}
463+
464+
/// Verify that the operand and result shapes of a MatMulOp are consistent:
/// both operands must be 2-D and the inner dimensions must agree. Unranked
/// types are accepted (shapes are checked again after inference).
llvm::LogicalResult MatMulOp::verify() {
  auto lhsType = llvm::dyn_cast<RankedTensorType>(getLhs().getType());
  auto rhsType = llvm::dyn_cast<RankedTensorType>(getRhs().getType());
  auto resultType = llvm::dyn_cast<RankedTensorType>(getType());

  // If any type is still unranked there is nothing to check yet.
  if (!lhsType || !rhsType || !resultType)
    return mlir::success();

  auto lhsShape = lhsType.getShape();
  auto rhsShape = rhsType.getShape();

  if (lhsShape.size() != 2 || rhsShape.size() != 2) {
    return emitOpError() << "expected 2D matrix";
  }

  // [m, k] x [k, n]: the inner dimensions must agree.
  // Fixed: the original diagnostic was missing separators ("matchthe shape",
  // "3!=2") and ended with a stray '\n'.
  if (lhsShape[1] != rhsShape[0]) {
    return emitOpError() << "expected dimensions to match: "
                         << "the shape of lhs is [" << lhsShape[0] << ", "
                         << lhsShape[1] << "] "
                         << "the shape of rhs is [" << rhsShape[0] << ", "
                         << rhsShape[1] << "] "
                         << "but the dimension " << lhsShape[1]
                         << " != " << rhsShape[0];
  }

  return mlir::success();
}
491+
440492
//===----------------------------------------------------------------------===//
441493
// TableGen'd op method definitions
442494
//===----------------------------------------------------------------------===//

mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "mlir/IR/BuiltinAttributes.h"
1616
#include "mlir/IR/BuiltinDialect.h"
1717
#include "mlir/IR/BuiltinOps.h"
18+
#include "mlir/IR/BuiltinTypeInterfaces.h"
1819
#include "mlir/IR/BuiltinTypes.h"
1920
#include "mlir/IR/Diagnostics.h"
2021
#include "mlir/IR/DialectRegistry.h"
@@ -31,9 +32,11 @@
3132
#include "mlir/Dialect/MemRef/IR/MemRef.h"
3233
#include "mlir/Pass/Pass.h"
3334
#include "mlir/Transforms/DialectConversion.h"
35+
#include "llvm/ADT/APFloat.h"
3436
#include "llvm/ADT/ArrayRef.h"
3537
#include "llvm/ADT/STLExtras.h"
3638
#include "llvm/ADT/Sequence.h"
39+
#include "llvm/ADT/StringExtras.h"
3740
#include "llvm/Support/Casting.h"
3841
#include <algorithm>
3942
#include <cstdint>
@@ -299,6 +302,94 @@ struct TransposeOpLowering : public OpConversionPattern<toy::TransposeOp> {
299302
}
300303
};
301304

305+
//===----------------------------------------------------------------------===//
306+
// ToyToAffine RewritePatterns: MatMul operations
307+
//===----------------------------------------------------------------------===//
308+
309+
struct MatMulOpLowering : public ConversionPattern {
310+
MatMulOpLowering(MLIRContext *ctx)
311+
: ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {}
312+
313+
LogicalResult
314+
matchAndRewrite(Operation *op, ArrayRef<Value> operands,
315+
ConversionPatternRewriter &rewriter) const final {
316+
auto loc = op->getLoc();
317+
318+
RankedTensorType lhsType =
319+
llvm::dyn_cast<RankedTensorType>(op->getOperand(0).getType());
320+
RankedTensorType rhsType =
321+
llvm::dyn_cast<RankedTensorType>(op->getOperand(1).getType());
322+
auto lhsShape = lhsType.getShape();
323+
auto rhsShape = rhsType.getShape();
324+
325+
auto tensorType =
326+
llvm::dyn_cast<RankedTensorType>((*op->result_type_begin()));
327+
328+
auto elemType = llvm::dyn_cast<FloatType>(tensorType.getElementType());
329+
330+
// Insert an allocation and deallocation for the result of this operation.
331+
auto memRefType = convertTensorToMemRef(tensorType);
332+
auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter);
333+
334+
SmallVector<int64_t, 4> lowerBounds(tensorType.getRank() + 1, /*Value=*/0);
335+
SmallVector<int64_t, 4> steps(tensorType.getRank() + 1, /*Value=*/1);
336+
SmallVector<int64_t, 4> upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]};
337+
338+
// add initialization of result tensor.
339+
// Create a nest of affine loops to initialize the result tensor to 0.
340+
affine::buildAffineLoopNest(
341+
rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1},
342+
[&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) {
343+
// Create a constant float value of 0.0.
344+
auto valueToStore = arith::ConstantFloatOp::create(
345+
nestedBuilder, loc, elemType,
346+
llvm::APFloat::getZero(elemType.getFloatSemantics()));
347+
348+
// Store the constant value into the allocated memory.
349+
affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc,
350+
ivs);
351+
});
352+
353+
// Create a nest of affine loops for matrix multiplication.
354+
affine::buildAffineLoopNest(
355+
rewriter, loc, lowerBounds, upperBounds, steps,
356+
[&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) {
357+
// Extract loop induction variables.
358+
Value m = ivs[0];
359+
Value k = ivs[1];
360+
Value n = ivs[2];
361+
362+
// Create an adaptor for the remapped operands of the MatMulOp.
363+
toy::MatMulOpAdaptor matmulAdaptor(operands);
364+
365+
// Load elements from the left-hand side and right-hand side matrices.
366+
auto loadedLhs = affine::AffineLoadOp::create(
367+
nestedBuilder, loc, matmulAdaptor.getLhs(), ValueRange{m, k});
368+
369+
auto loadedRhs = affine::AffineLoadOp::create(
370+
nestedBuilder, loc, matmulAdaptor.getRhs(), ValueRange{k, n});
371+
// Load elements from the result tensor from initial process above.
372+
auto loadedRes = affine::AffineLoadOp::create(
373+
nestedBuilder, loc, alloc, ValueRange{m, n});
374+
375+
// Perform the multiplication and addition operations.
376+
auto mulop =
377+
arith::MulFOp::create(nestedBuilder, loc, loadedLhs, loadedRhs);
378+
auto valueToStore =
379+
arith::AddFOp::create(nestedBuilder, loc, loadedRes, mulop);
380+
381+
// Store the result back into the allocated memory.
382+
affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc,
383+
ValueRange{m, n});
384+
});
385+
386+
// Replace this operation with the generated alloc.
387+
rewriter.replaceOp(op, alloc);
388+
389+
return success();
390+
}
391+
};
392+
302393
} // namespace
303394

304395
//===----------------------------------------------------------------------===//
@@ -350,8 +441,8 @@ void ToyToAffineLoweringPass::runOnOperation() {
350441
// the set of patterns that will lower the Toy operations.
351442
RewritePatternSet patterns(&getContext());
352443
patterns.add<AddOpLowering, ConstantOpLowering, FuncOpLowering, MulOpLowering,
353-
PrintOpLowering, ReturnOpLowering, TransposeOpLowering>(
354-
&getContext());
444+
PrintOpLowering, ReturnOpLowering, TransposeOpLowering,
445+
MatMulOpLowering>(&getContext());
355446

356447
// With the target and rewrite patterns defined, we can now attempt the
357448
// conversion. The conversion will signal failure if any of our `illegal`

mlir/cuda-tile/Toy/mlir/MLIRGen.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,15 @@ class MLIRGenImpl {
331331
return TransposeOp::create(builder, location, operands[0]);
332332
}
333333

334+
if (callee == "matmul") {
335+
if (call.getArgs().size() != 2) {
336+
emitError(location, "MLIR codegen encountered an error: toy.matmul "
337+
"expected 2 arguments");
338+
return nullptr;
339+
}
340+
return MatMulOp::create(builder, location, operands[0], operands[1]);
341+
}
342+
334343
// Otherwise this is a call to a user-defined function. Calls to
335344
// user-defined functions are mapped to a custom call that takes the callee
336345
// name as an attribute.

mlir/cuda-tile/sample/matmul.toy

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
def main() {
  # Define a variable `a` with shape <2, 3>, initialized with the literal value.
  # The shape is inferred from the supplied literal.
  var a = [[1, 2, 3], [4, 5, 6]];

  # b is identical to a, the literal tensor is implicitly reshaped: defining new
  # variables is the way to reshape tensors (element count must match).
  var b<2, 3> = [1, 2, 3, 4, 5, 6];

  # Transpose b to <3, 2> so the inner dimensions line up, then compute the
  # matrix product of a (<2, 3>) and transpose(b) (<3, 2>) with the matmul
  # builtin and print the <2, 2> result.
  print(matmul(a, transpose(b)));
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Toy-dialect IR exercising toy.matmul. The bare "N+" lines below are diff
// markers retained from the commit view this text was captured from.
// Helper: transposes both (unranked) operands, then multiplies them.
toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> {
2+
%0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64>
3+
%1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64>
4+
%2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64>
5+
toy.return %2 : tensor<*xf64>
6+
}
7+
8+
// Entry point: builds a 2x3 constant and a 3x2 reshaped constant, calls
// @matmul_transpose on them, and prints the (unranked) result.
toy.func @main() {
9+
%0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>
10+
%1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64>
11+
%2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64>
12+
%3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64>
13+
%4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64>
14+
toy.print %4 : tensor<*xf64>
15+
toy.return
16+
}

0 commit comments

Comments
 (0)