添加异步局部调度实现及相关测试

Alwaysproblem · Alwaysproblem · commit 676f9b52533e · 2026-03-13T10:37:07.000Z
diff --git a/mlir/optimization/scheduler/CMakeLists.txt b/mlir/optimization/scheduler/CMakeLists.txt
@@ -17,6 +17,7 @@ add_executable(
   lib/MemrefLifetime.cpp
   lib/FusionFeasibility.cpp
   lib/LivenessAdapter.cpp
+  lib/LocalListScheduling.cpp
   )
 
 # add_dependencies(lab-scheduler ToyCh6ShapeInferenceInterfaceIncGen
diff --git a/mlir/optimization/scheduler/include/lab/LabPasses.h b/mlir/optimization/scheduler/include/lab/LabPasses.h
@@ -15,5 +15,6 @@ std::unique_ptr<Pass> createLabPipelinePlanPass();
 std::unique_ptr<Pass> createLabLivenessPass();
 std::unique_ptr<Pass> createLabMemrefLifetimePass();
 std::unique_ptr<Pass> createLabFusionFeasibilityPass();
+std::unique_ptr<Pass> createAsyncLocalSchedulePass();
 
 } // namespace mlir
diff --git a/mlir/optimization/scheduler/lab-opt.cpp b/mlir/optimization/scheduler/lab-opt.cpp
@@ -1,6 +1,7 @@
 #include "lab/LabPasses.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -19,7 +20,8 @@ int main(int argc, char **argv) {
   registry.insert<mlir::func::FuncDialect, mlir::linalg::LinalgDialect,
                   mlir::arith::ArithDialect, mlir::tensor::TensorDialect,
                   mlir::memref::MemRefDialect, mlir::scf::SCFDialect,
-                  mlir::affine::AffineDialect, mlir::cf::ControlFlowDialect>();
+                  mlir::affine::AffineDialect, mlir::cf::ControlFlowDialect,
+                  mlir::async::AsyncDialect>();
 
   mlir::registerAllPasses();
   mlir::PassPipelineRegistration<>("lab-op-stats", "Lab Op Stats Pass",
@@ -45,6 +47,12 @@ int main(int argc, char **argv) {
         pm.addPass(mlir::createLabFusionFeasibilityPass());
       });
 
+  mlir::PassPipelineRegistration<>(
+      "lab-async-local-schedule", "Lab Async Local Schedule Pass",
+      [](mlir::OpPassManager &pm) {
+        pm.addPass(mlir::createAsyncLocalSchedulePass());
+      });
+
   return mlir::asMainReturnCode(
       mlir::MlirOptMain(argc, argv, "Lab optimizer\n", registry));
 }
diff --git a/mlir/optimization/scheduler/lib/LocalListScheduling.cpp b/mlir/optimization/scheduler/lib/LocalListScheduling.cpp
@@ -0,0 +1,308 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace mlir;
+
+namespace {
+
+// Scheduling category of an operation. classifyOp() assigns the category and
+// priorityOf() turns it into the priority used to pick from the ready list.
+enum class NodeKind {
+  AsyncExecute, // async.execute
+  AsyncAwait,   // async.await
+  PureCompute,  // op for which isMemoryEffectFree() holds
+  BarrierLike,  // terminator
+  Other         // everything else
+};
+
+// One node of the per-window dependency graph.
+struct SchedNode {
+  // Operation represented by this node.
+  Operation *op = nullptr;
+  // Category used for prioritisation (see priorityOf()).
+  NodeKind kind = NodeKind::Other;
+  // Indices (into the window's node array) of nodes that must come first.
+  SmallVector<int> preds;
+  // Indices (into the window's node array) of nodes that must come later.
+  SmallVector<int> succs;
+  // Number of not-yet-scheduled predecessors; scheduleWindow() recomputes it
+  // from preds.size() before scheduling.
+  int indegree = 0;
+  // Position of the operation inside its window before reordering; used as
+  // the tie-breaker between nodes of equal priority.
+  int originalOrder = -1;
+};
+
+/// Returns true when `ty` is an async token type or an async value type.
+static bool isAsyncType(Type ty) {
+  return isa<async::TokenType, async::ValueType>(ty);
+}
+
+/// Maps `op` to the scheduling category used for prioritisation.
+///
+/// The checks are ordered: async ops first, then terminators, then ops
+/// without memory effects; whatever is left is `NodeKind::Other`.
+static NodeKind classifyOp(Operation *op) {
+  if (isa<async::ExecuteOp>(op))
+    return NodeKind::AsyncExecute;
+  if (isa<async::AwaitOp>(op))
+    return NodeKind::AsyncAwait;
+
+  // Terminators are treated as barriers.
+  const bool isTerminator = op->hasTrait<OpTrait::IsTerminator>();
+  if (isTerminator)
+    return NodeKind::BarrierLike;
+
+  // An ordinary op without side effects counts as pure computation.
+  return isMemoryEffectFree(op) ? NodeKind::PureCompute : NodeKind::Other;
+}
+
+/// Returns true when `op` must not take part in window scheduling and
+/// therefore ends the current window.
+static bool isBarrier(Operation *op) {
+  // A terminator is always a barrier.
+  if (op->hasTrait<OpTrait::IsTerminator>())
+    return true;
+
+  // async.execute / async.await are not barriers themselves, and pure ops are
+  // allowed inside a window. Everything else is conservatively a barrier.
+  const bool schedulable =
+      isa<async::ExecuteOp, async::AwaitOp>(op) || isMemoryEffectFree(op);
+  return !schedulable;
+}
+
+/// Builds a lookup table from each operation in `ops` to its position in
+/// `ops`.
+static DenseMap<Operation *, int> buildOpIndex(ArrayRef<Operation *> ops) {
+  DenseMap<Operation *, int> indexOf;
+  for (int pos = 0, end = static_cast<int>(ops.size()); pos < end; ++pos)
+    indexOf[ops[pos]] = pos;
+  return indexOf;
+}
+
+/// Records the dependency edge `u -> v` (node `u` must be scheduled before
+/// node `v`). Self-loops and edges that already exist are ignored.
+static void addEdge(SmallVectorImpl<SchedNode> &nodes, int u, int v) {
+  const bool selfLoop = (u == v);
+  // The successor list doubles as the duplicate check.
+  if (selfLoop || llvm::is_contained(nodes[u].succs, v))
+    return;
+
+  SchedNode &from = nodes[u];
+  SchedNode &to = nodes[v];
+  from.succs.push_back(v);
+  to.preds.push_back(u);
+}
+
+/// Adds an edge `def -> user` for every SSA value that an operation of the
+/// window consumes from another operation of the same window.
+///
+/// Uses inside nested regions are taken into account as well: the body of an
+/// op such as `async.execute` can reference values defined above it without
+/// listing them as operands. Looking only at the direct operands would miss
+/// those uses and allow the scheduler to move the op above the definition of
+/// a value its body reads.
+///
+/// \param ops    Operations of the window, in their current block order.
+/// \param nodes  Graph nodes, parallel to `ops`; edges are added in place.
+static void buildSSADependencies(ArrayRef<Operation *> ops,
+                                 SmallVectorImpl<SchedNode> &nodes) {
+  auto opToIdx = buildOpIndex(ops);
+
+  for (int user = 0, e = static_cast<int>(ops.size()); user < e; ++user) {
+    // walk() visits ops[user] itself and every operation nested in its
+    // regions, so both direct operands and captured values are covered.
+    ops[user]->walk([&](Operation *nested) {
+      for (Value operand : nested->getOperands()) {
+        Operation *def = operand.getDefiningOp();
+        if (!def)
+          continue; // Block argument: no defining operation to order against.
+
+        auto it = opToIdx.find(def);
+        if (it == opToIdx.end())
+          continue; // Not produced by a top-level operation of this window.
+
+        addEdge(nodes, it->second, user);
+      }
+    });
+  }
+}
+
+/// Returns true when `a` and `b` must keep their relative order even without
+/// an SSA dependency between them.
+///
+/// No extra constraint is needed when both operations are either free of
+/// memory effects or one of async.execute / async.await; in this first
+/// version those may be interleaved as long as their SSA dependencies hold.
+/// Every other combination is ordered conservatively.
+///
+/// NOTE(review): the memory effects of an async.execute body are not
+/// inspected here.
+static bool needsConservativeOrder(Operation *a, Operation *b) {
+  auto freelyMovable = [](Operation *op) {
+    return isa<async::ExecuteOp, async::AwaitOp>(op) || isMemoryEffectFree(op);
+  };
+
+  return !(freelyMovable(a) && freelyMovable(b));
+}
+
+/// Adds an edge `earlier -> later` for every pair of window operations that
+/// needsConservativeOrder() says must keep their original relative order.
+static void buildConservativeOrderEdges(ArrayRef<Operation *> ops,
+                                        SmallVectorImpl<SchedNode> &nodes) {
+  const int count = static_cast<int>(ops.size());
+  for (int earlier = 0; earlier < count; ++earlier) {
+    for (int later = earlier + 1; later < count; ++later) {
+      if (!needsConservativeOrder(ops[earlier], ops[later]))
+        continue;
+      addEdge(nodes, earlier, later);
+    }
+  }
+}
+
+/// Returns the scheduling priority of a node category; among ready nodes the
+/// scheduler picks the one with the largest value first.
+static int priorityOf(NodeKind kind) {
+  int priority = 0;
+  switch (kind) {
+  case NodeKind::AsyncExecute:
+    priority = 300;
+    break;
+  case NodeKind::PureCompute:
+    priority = 200;
+    break;
+  case NodeKind::AsyncAwait:
+    priority = 100;
+    break;
+  case NodeKind::Other:
+    priority = 50;
+    break;
+  case NodeKind::BarrierLike:
+    priority = 0;
+    break;
+  }
+  return priority;
+}
+
+/// List-schedules the dependency graph of one window.
+///
+/// Repeatedly picks, among the nodes whose predecessors have all been
+/// scheduled, the one with the highest priorityOf() value; ties go to the
+/// node with the smaller `originalOrder`.
+///
+/// \returns the node indices in scheduled order, or an empty vector when not
+///          every node could be scheduled (the graph has a cycle), in which
+///          case the caller gives up on the window.
+static SmallVector<int> scheduleWindow(ArrayRef<SchedNode> inputNodes) {
+  // Work on a private copy: the in-degree counters are consumed below.
+  SmallVector<SchedNode> work(inputNodes.begin(), inputNodes.end());
+  const int total = static_cast<int>(work.size());
+
+  // Seed the ready list with every node that has no predecessor.
+  SmallVector<int> ready;
+  for (int idx = 0; idx < total; ++idx) {
+    work[idx].indegree = static_cast<int>(work[idx].preds.size());
+    if (work[idx].indegree == 0)
+      ready.push_back(idx);
+  }
+
+  // True when `candidate` has to be picked ahead of `incumbent`.
+  auto beats = [&work](int candidate, int incumbent) {
+    const int candidatePrio = priorityOf(work[candidate].kind);
+    const int incumbentPrio = priorityOf(work[incumbent].kind);
+    if (candidatePrio != incumbentPrio)
+      return candidatePrio > incumbentPrio;
+    return work[candidate].originalOrder < work[incumbent].originalOrder;
+  };
+
+  SmallVector<int> order;
+  order.reserve(work.size());
+
+  while (!ready.empty()) {
+    int bestPos = 0;
+    for (int pos = 1, end = static_cast<int>(ready.size()); pos < end; ++pos) {
+      if (beats(ready[pos], ready[bestPos]))
+        bestPos = pos;
+    }
+
+    const int picked = ready[bestPos];
+    ready.erase(ready.begin() + bestPos);
+    order.push_back(picked);
+
+    // Release the successors whose last pending predecessor was `picked`.
+    for (int succ : work[picked].succs) {
+      if (--work[succ].indegree == 0)
+        ready.push_back(succ);
+    }
+  }
+
+  // Some node never became ready: there is a cycle, give up on the window.
+  if (order.size() != work.size())
+    return {};
+
+  return order;
+}
+
+/// Splits `block` into maximal runs of consecutive non-barrier operations.
+///
+/// Barrier operations (see isBarrier()) end the current run and are not part
+/// of any window; empty runs are dropped.
+static SmallVector<SmallVector<Operation *>> collectWindows(Block &block) {
+  SmallVector<SmallVector<Operation *>> windows;
+  SmallVector<Operation *> pending;
+
+  // Hands the pending run over to `windows` if it holds anything.
+  auto flush = [&windows, &pending] {
+    if (pending.empty())
+      return;
+    windows.push_back(std::move(pending));
+    pending.clear();
+  };
+
+  for (Operation &op : block) {
+    if (isBarrier(&op))
+      flush();
+    else
+      pending.push_back(&op);
+  }
+
+  // The block may end while a run is still open.
+  flush();
+
+  return windows;
+}
+
+/// Reorders the operations of one window according to scheduleWindow().
+///
+/// Builds the dependency graph of the window (SSA edges plus conservative
+/// ordering edges), schedules it and, when the resulting order differs from
+/// the current one, moves the operations into that order in place.
+///
+/// \param ops  Consecutive operations of a single block, in block order.
+/// \returns true when at least one operation changed position; false when
+///          the window has fewer than two operations, scheduling failed or
+///          the order is already the scheduled one.
+static bool reorderWindow(ArrayRef<Operation *> ops) {
+  if (ops.size() < 2)
+    return false;
+
+  SmallVector<SchedNode> nodes;
+  nodes.reserve(ops.size());
+
+  for (auto [i, op] : llvm::enumerate(ops)) {
+    nodes.push_back(SchedNode{
+        .op = op,
+        .kind = classifyOp(op),
+        .preds = {},
+        .succs = {},
+        .indegree = 0,
+        .originalOrder = static_cast<int>(i),
+    });
+  }
+
+  buildSSADependencies(ops, nodes);
+  buildConservativeOrderEdges(ops, nodes);
+
+  SmallVector<int> newOrder = scheduleWindow(nodes);
+  if (newOrder.empty())
+    return false;
+
+  // Nothing to do when the schedule is the identity permutation.
+  bool changed = false;
+  for (int i = 0, e = static_cast<int>(newOrder.size()); i < e; ++i) {
+    if (newOrder[i] != i) {
+      changed = true;
+      break;
+    }
+  }
+  if (!changed)
+    return false;
+
+  // Anchor: the position right after the window. When the window reaches the
+  // end of the block the anchor is the block's end iterator; a
+  // default-constructed iterator is not a valid insertion point.
+  Block *block = ops.front()->getBlock();
+  Operation *afterWindow = ops.back()->getNextNode();
+  Block::iterator insertPt =
+      afterWindow ? afterWindow->getIterator() : block->end();
+
+  // Moving every operation, in scheduled order, in front of the same anchor
+  // lays them out in exactly that order.
+  for (int idx : newOrder)
+    nodes[idx].op->moveBefore(block, insertPt);
+
+  return true;
+}
+
+// Function-level pass that locally reorders operations inside each block of
+// the function body; the work is done in runOnOperation().
+struct AsyncLocalSchedulePass
+    : public PassWrapper<AsyncLocalSchedulePass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AsyncLocalSchedulePass)
+
+  // Command-line argument of the pass.
+  StringRef getArgument() const final { return "lab-async-local-schedule"; }
+  // One-line description of the pass.
+  StringRef getDescription() const final {
+    return "Locally reorder async.execute/async.await inside a block";
+  }
+
+  void runOnOperation() override;
+};
+
+/// Runs the local scheduler on every block directly contained in the
+/// function body: each block is split into windows and every window is
+/// reordered independently. Blocks nested inside other operations are not
+/// visited.
+void AsyncLocalSchedulePass::runOnOperation() {
+  func::FuncOp func = getOperation();
+
+  bool changed = false;
+  for (Block &block : func.getBody()) {
+    // All windows are collected before any operation is moved; reordering
+    // only permutes operations inside their own window.
+    for (auto &window : collectWindows(block))
+      changed |= reorderWindow(window);
+  }
+
+  // The IR was left untouched, so cached analyses remain valid.
+  if (!changed)
+    markAllAnalysesPreserved();
+}
+
+} // namespace
+
+namespace mlir {
+/// Creates a new instance of the async local scheduling pass.
+std::unique_ptr<Pass> createAsyncLocalSchedulePass() {
+  return std::make_unique<AsyncLocalSchedulePass>();
+}
+} // namespace mlir
diff --git a/mlir/optimization/scheduler/tests/async_reorder.mlir b/mlir/optimization/scheduler/tests/async_reorder.mlir
@@ -0,0 +1,14 @@
+// Two independent async.execute / async.await pairs whose awaited results are
+// added together. Each async.await directly follows the async.execute that
+// produces its operand.
+// NOTE(review): presumably input for the lab-async-local-schedule pipeline;
+// the file has no RUN/CHECK lines - confirm how it is meant to be verified.
+func.func @test(%c0 : i32, %c1 : i32) -> i32 {
+  %token0, %t0 = async.execute -> !async.value<i32> {
+    async.yield %c0 : i32
+  }
+  %v0 = async.await %t0 : !async.value<i32>
+
+  %token1, %t1 = async.execute -> !async.value<i32> {
+    async.yield %c1 : i32
+  }
+  %v1 = async.await %t1 : !async.value<i32>
+
+  %sum = arith.addi %v0, %v1 : i32
+  return %sum : i32
+}
diff --git a/mlir/optimization/scheduler/tests/async_reorder_complex.mlir b/mlir/optimization/scheduler/tests/async_reorder_complex.mlir

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ add_executable(`
`17`	`17`	`lib/MemrefLifetime.cpp`
`18`	`18`	`lib/FusionFeasibility.cpp`
`19`	`19`	`lib/LivenessAdapter.cpp`
	`20`	`+ lib/LocalListScheduling.cpp`
`20`	`21`	`)`
`21`	`22`
`22`	`23`	`# add_dependencies(lab-scheduler ToyCh6ShapeInferenceInterfaceIncGen`