@@ -188,24 +188,18 @@ struct EmbedCudaTileBinaryPass
188188 }
189189 auto binBytes = *binBytesOrErr;
190190
191- // ---- Step E: embed binary as LaunchGpuOp attributes ----
192- llvm::SmallVector<uint8_t , 0 > binU8Bytes;
193- binU8Bytes.reserve (binBytes.size ());
194- for (auto b : binBytes)
195- binU8Bytes.push_back (static_cast <uint8_t >(b));
196-
197- auto byteAttr = mlir::DenseIntElementsAttr::get (
198- mlir::RankedTensorType::get ({static_cast <int64_t >(binU8Bytes.size ())},
199- mlir::IntegerType::get (ctx, 8 )),
200- binU8Bytes);
201-
202- // launchOp->setAttr("cuda_binary", byteAttr);
203- launchOp->setAttr (" cuda_binary_size" ,
204- mlir::IntegerAttr::get (mlir::IntegerType::get (ctx, 64 ),
205- binU8Bytes.size ()));
191+ // ---- Step E: embed binary metadata as LaunchGpuOp attributes ----
192+ // Note: we currently only attach metadata (size/path/arch), not the
193+ // raw cuda_binary bytes themselves.
194+
195+ launchOp->setDiscardableAttr (
196+ " cuda_binary_size" ,
197+ mlir::IntegerAttr::get (mlir::IntegerType::get (ctx, 64 ),
198+ static_cast <int64_t >(binBytes.size ())));
206199 launchOp->setDiscardableAttr (
207200 " cuda_binary_path" , mlir::StringAttr::get (ctx, cudaBinPath.str ()));
208- launchOp->setAttr (" cuda_arch" , mlir::StringAttr::get (ctx, gpuName));
201+ launchOp->setDiscardableAttr (" cuda_arch" ,
202+ mlir::StringAttr::get (ctx, gpuName));
209203 });
210204
211205 // ---- Step F: Delete the cuda_tile.module ops ----
0 commit comments