Plotting updates & occupancy bug fix

tanner-andrulis · tanner-andrulis · commit d1c7199f4c2a · 2025-01-08T15:04:34.000-05:00
diff --git a/pytimeloop/fastfusion/mapper/simexplore.py b/pytimeloop/fastfusion/mapper/simexplore.py
@@ -33,21 +33,32 @@ def fuse_sims(sims: list[SIM], resource2capacity: dict=None, return_nmappings_nb
         nbuckets.append(len(s))
         nmappings.append(sum(len(s2.mapping.data) for s2 in s))
         next_and_prev_live_tensors = next_live_tensors | s[0].tensor_names
+        shared_tensors = set(s[0].tensor_names) & set(ns[0].tensor_names)
 
         first_ns = ns[0]
         ns = SIM.group_by_left(ns, s[0].tensor_names)
         s = SIM.group_by_right(s, first_ns.tensor_names, keep_loops=True)
-
+            
         for k, ns2 in ns.items():
             for ns3 in ns2:
-                ns3.consolidate(next_live_tensors, resource2capacity)
+                ns3.consolidate(next_live_tensors, resource2capacity, shared_tensors)
             ns[k] = SIM.combine_combineable(ns2, live_tensors)
+
         for k, s2 in s.items():
             for s3 in s2:
-                s3.consolidate(next_live_tensors, resource2capacity)
+                s3.consolidate(next_live_tensors, resource2capacity, shared_tensors)
             s[k] = SIM.combine_combineable(s2, next_and_prev_live_tensors)
+            
+        # We freed these in the consolidation step
+        for ns2 in [s, ns]:
+            for ns3 in ns2.values():
+                for ns4 in ns3:
+                    for t in list(ns4.tensors):
+                        if t not in next_live_tensors:
+                            del ns4.tensors[t]
 
         DO_PRINT = True
+        DELAY_MERGE = True
 
         combined: list[SIM] = []
         for k in s:
@@ -57,12 +68,19 @@ def fuse_sims(sims: list[SIM], resource2capacity: dict=None, return_nmappings_nb
                     ns: SIM
                     if DO_PRINT:
                         print(f"\t{a.tiling_str()} {a.get_shared_loop_index(live_tensors)} <--> {b.tiling_str()}{b.get_shared_loop_index(next_and_prev_live_tensors)}. ({len(a.mapping.data)})x({len(b.mapping.data)})")
-                    combined.append(a.merge_next(b, next_live_tensors, resource2capacity, delay=True))
+                    if not sims:
+                        print(a.merge_next(b, next_live_tensors, resource2capacity, delay=False))
+                    combined.append(a.merge_next(b, next_live_tensors, resource2capacity, delay=DELAY_MERGE))
             elif DO_PRINT:
                 print(f"\tNo match for {k} ||||||||| {s[k][0].tiling_str()}")
 
-        for c, mapping in zip(combined, Parallel(n_jobs=128)(c.mapping for c in combined)):
-            c.mapping = mapping
+        if DELAY_MERGE:
+            for c, mapping in zip(combined, Parallel(n_jobs=128)(c.mapping for c in combined)):
+                c.mapping = mapping
+        else:
+            for c, mapping in zip(combined, (c.mapping for c in combined)):
+                c.mapping = mapping
+            
         print(f"\tCombining {sum(len(s2) for s2 in s)}({len(s)}) x {sum(len(s2) for s2 in ns)}({len(ns)}) -> {len(combined)}")
         if DO_PRINT:
             for k in ns:
diff --git a/pytimeloop/fastfusion/pareto.py b/pytimeloop/fastfusion/pareto.py
@@ -59,7 +59,7 @@ def nameloop2col(name, nloops, left: bool=False):
 def is_left_col(x):
     return "_LEFT_LEVEL_" in x
 
-MERGE_SUFFIXES = ["_RIGHT_MERGE", "_LEFT_MERGE"]
+MERGE_SUFFIXES = ["_LEFT_MERGE", "_RIGHT_MERGE"]
 
 def is_merge_col(c):
     return any(c.endswith(s) for s in MERGE_SUFFIXES)
@@ -109,21 +109,23 @@ def makepareto(data: pd.DataFrame) -> pd.DataFrame:
         return data
     return data[paretoset(data[columns])].reset_index(drop=True)
 
-def squish_left_right(data: pd.DataFrame):
+def squish_left_right(data: pd.DataFrame, shared_loop_index: int=None):
     nloops2left = defaultdict(set)
+    dropcols = []
     for c in data.columns:
         if (name_nloops := col2nameloop(c)) is not None:
             if is_left_col(c):
                 name, nloops = name_nloops
-                nloops2left[nloops].add((c, name))
+                if shared_loop_index is None or nloops == shared_loop_index:
+                    nloops2left[nloops].add((c, name))
+                    dropcols.append(c)
             
     for n in nloops2left.keys():
         for c, name in nloops2left[n]:
             target = nameloop2col(name, n)
             max_to_col(data, target, c)
             
-    keepcols = [c for c in data.columns if not is_left_col(c)]
-    return data[keepcols]
+    return data[[c for c in data.columns if c not in dropcols]]
 
 def free_to_loop_index(data: pd.DataFrame, shared_loop_index: int, skip_pareto: bool=False) -> pd.DataFrame:
     nloops2left = defaultdict(set)
@@ -179,6 +181,7 @@ def merge_cross(
     as_pareto: bool = False,
 ) -> pd.DataFrame:
     left = free_to_loop_index(left, shared_loop_index + 1)
+    left = squish_left_right(left, shared_loop_index + 1)
     for c in left.columns:
         if (name_nloops := col2nameloop(c)) is not None:
             if c not in right.columns:
@@ -221,7 +224,6 @@ def merge_cross(
     #  *  Can't bake into compatiblity unless we have a notion of left vs.
     #     right pipelined.
     
-    
     # PIPELINE CHANGES REQUIRED:
     # - Latency above above loop index (first tile), below (all subsequent tiles)
     # - Tiling includes information for how may be fused:
@@ -277,6 +279,31 @@ def merge_cross(
     # Update the IN_PROGRESS_STATS
     for i, r in df[cols].iterrows():
         df.at[i, IN_PROGRESS_STATS][last] = r.to_dict()
+        
+    CHECK_CORRECTNESS = False
+    if CHECK_CORRECTNESS:
+        from pytimeloop.fastfusion.plot.looptree import tilings2looptree
+        df_check = free_to_loop_index(df.copy(), -1, skip_pareto=True)
+        for i, r in df_check.iterrows():
+            looptree = tilings2looptree(r[MAPPING], r[STATS], r[TENSORS], r[IN_PROGRESS_STATS], skip_backing_tensors=next_live_tensors)
+            reservations = dict(looptree.get_reservations())
+            for k, v in reservations.items():
+                col = nameloop2col(k, -1)
+                if col not in df_check.columns:
+                    got = r[[c for c in df_check.columns if col2nameloop(c) is not None]]
+                    raise ValueError(f"Missing {k}: Expected {reservations}. Got: {got}")
+                if r[col] != v:
+                    got = r[[c for c in df_check.columns if col2nameloop(c) is not None]]
+                    raise ValueError(f"Mismatched {k}: {v} != {r[col]}. Expected {reservations}. Got: {got}")
+                # import pydot
+                # graph = pydot.Dot(graph_type="digraph", ranksep="0.2", nodesep="0.2")
+                # looptree.to_pydot(graph)
+                # with open(f"test.png", "wb") as f:
+                #     f.write(graph.create_png())
+                # all_tensors = set(t for tn in r[TENSORS].values() for t in tn)
+                # for t in sorted(all_tensors):
+                #     print(f"{t.__repr__()},")
+
 
     # Assert no NaNs
     assert not df.isnull().values.any()
@@ -297,8 +324,6 @@ def concat(paretos: list["Pareto"]) -> "Pareto":
 
     def merge(self, other: "Pareto", shared_loop_index: int, next_shared_loop_index: int, resource2capacity: dict[str, int], next_live_tensors: set[int], delay: bool=False) -> "Pareto":
         d = delayed(merge_cross)(self.data, other.data, shared_loop_index, next_shared_loop_index, resource2capacity, next_live_tensors=next_live_tensors, as_pareto=True)
-        if not delay:
-            print("AHH")
         return d if delay else d[0](*d[1], **d[2])
 
     @staticmethod
diff --git a/pytimeloop/fastfusion/plot/looptree.py b/pytimeloop/fastfusion/plot/looptree.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 import pydot
-from typing import Any
+from typing import Any, Iterable
 from pytimeloop.fastfusion.sim import Tiling, TensorStorage, Loop
 from pytimeloop.fastfusion.util import expfmt
 from pytimeloop.fastfusion.pareto import IN_PROGRESS_STATS
@@ -38,7 +38,7 @@ def _to_yaml(self):
     def to_yaml(self):
         return {"mapping": "fused", "nodes": self._to_yaml()}
 
-    def to_pydot(self, graph, parent=None, invisible_root: bool = True):
+    def to_pydot(self, graph, parent=None, invisible_root: bool = False):
         label_lines = []
         for t in self.this_level:
             label_lines.append(t.pydot_str() if hasattr(t, "pydot_str") else str(t))
@@ -49,7 +49,8 @@ def to_pydot(self, graph, parent=None, invisible_root: bool = True):
             node = pydot.Node(id(self), label=node_label, **PYDOT_NODE_DEFAULTS)
             graph.add_node(node)
         if parent:
-            graph.add_edge(pydot.Edge(parent, node))
+            reservations = "\n".join(sorted(f"[{k}] {expfmt(v)}" for k, v in self.get_reservations().items()))
+            graph.add_edge(pydot.Edge(parent, node, label=reservations))
         for child in self.children:
             child.to_pydot(graph, node, invisible_root=False)
 
@@ -59,9 +60,18 @@ def add_stats(self, stats: dict[str, Any]):
         else:
             for k, v in stats.items():
                 self.this_level.append(f"{k}: {expfmt(v)}")
+                
+    def get_reservations(self) -> dict[str, int]:
+        reservations = defaultdict(lambda: 0)
+        for c in self.children:
+            for k, v in c.get_reservations().items():
+                reservations[k] = max(reservations[k], v)
+        for t in self.this_level:
+            if isinstance(t, TensorStorage):
+                reservations[t.backer_id] += t.tile_size
+        return reservations
 
-
-def tilings2looptree(mappings: dict[str, Tiling], stats: dict[str, Any], tensors: dict[str, list[TensorStorage]], partial_stats: dict[str, Any]):
+def tilings2looptree(mappings: dict[str, Tiling], stats: dict[str, Any], tensors: dict[str, list[TensorStorage]], partial_stats: dict[str, Any], skip_backing_tensors: Iterable[str] = ()):
     prev_tiling = None
     root = Node()
     einsum_ids = list(mappings.keys())
@@ -79,27 +89,36 @@ def tilings2looptree(mappings: dict[str, Tiling], stats: dict[str, Any], tensors
             n.children.append(Node())
             n = n.children[-1]
         n.children.append(Node()) # Leaf node
-        for tensor in tiling.tensors:
-            root.access_level(tensor.above_loop_index).this_level.append(tensor)
+        id2tensor = defaultdict(lambda: [])
+        for t in tiling.tensors:
+            id2tensor[t.tensor_id].append(t)
+        id2tensor = {k: sorted(v, key=lambda x: (x.above_loop_index, x.backer_id)) for k, v in id2tensor.items()}
+        for tensor_id, storages in id2tensor.items():
+            if tensor_id in skip_backing_tensors:
+                storages = storages[1:]
+            for tensor in storages:
+                if tensor not in n.this_level:
+                    root.access_level(tensor.above_loop_index).this_level.append(tensor)
         for i, l in enumerate(loops):
             root.access_level(index + i + 1).this_level.append(l)
-        root.add_stats(stats[einsum_id])
         last_level = root.access_level(None).this_level
-        for tensor in tiling.tensors:
-            if tensor not in last_level:
-                last_level.append(tensor)
-                total_resources[tensor.backer_id] += tensor.tile_size
+        first_level = root.access_level(0).this_level
         for tensor in tensors[einsum_id]:
-            if tensor not in last_level:
-                last_level.append(tensor.pydot_str() + "**")
-                total_resources[tensor.backer_id] += tensor.tile_size
+            if tensor.tensor_id not in skip_backing_tensors:
+                if tensor not in mappings[einsum_id].tensors:
+                    # tensor = TensorStorage(
+                    #     f"*{tensor.tensor_id}", 
+                    #     tensor.backer_id, 
+                    #     tensor.above_loop_index, 
+                    #     tensor.tile_size
+                    # )
+                    first_level.append(tensor)
+                    total_resources[tensor.backer_id] += tensor.tile_size
         for k, v in total_resources.items():
             last_level.append(f"({k}) TOTAL: {expfmt(v)}")
-            
+        root.add_stats(stats[einsum_id])
         for k, v in partial_stats[einsum_id].items():
             last_level.append(f"_PARTIAL {k}: {expfmt(v)}")
-
-
         prev_tiling = tiling
     return root
 
diff --git a/pytimeloop/fastfusion/sim.py b/pytimeloop/fastfusion/sim.py
@@ -53,8 +53,10 @@ def __str__(self):
         return ("S-" if self.is_spatial else "") + f"{self.rank_id}-{self.bound}"
     
     def pydot_str(self):
-        return f"{self.rank_id} sz {expfmt(self.bound)} {'S' if self.is_spatial else ''} * {expfmt(self.n_repititions)}"
-
+        if self.is_spatial:
+            return f"S-for R{self.rank_id} size {expfmt(self.bound)}"
+        return f"for R{self.rank_id} size {expfmt(self.bound)}"  # same "R" rank prefix as the spatial label
+            
     def rename(self, rank_renaming: dict[str, str], tensor_renaming: dict[str, str]) -> "Loop":
         return Loop(rank_renaming[self.rank_id], self.bound, self.is_spatial)
     
@@ -85,14 +87,14 @@ def ts(self):
         return self.tile_size
 
     def __str__(self):
-        return f"({self.backer_id}) {self.tensor_id} sz {expfmt(self.tile_size)} above {self.above_loop_index}"# x{expfmt(self.n_repititions)}"
+        return f"[{self.backer_id}] {self.tensor_id} sz {expfmt(self.tile_size)} above {self.above_loop_index}"# x{expfmt(self.n_repititions)}"
 
     def __repr__(self):
         return f"TensorStorage({self.tensor_id}, {self.backer_id}, {self.above_loop_index}, {self.tile_size})"#, {self.n_repititions})"
     
     def pydot_str(self):
-        return f"({self.backer_id}) {self.tensor_id} size " \
-            f"{expfmt(self.tile_size)}"#*{expfmt(self.n_repititions)}={expfmt(self.tile_size)}"# * self.n_repititions)}"
+        return f"[{self.backer_id}] T{self.tensor_id} size {expfmt(self.tile_size)}"
+            #*{expfmt(self.n_repititions)}={expfmt(self.tile_size)}"# * self.n_repititions)}"
     
     def rename(self, rank_renaming: dict[str, str], tensor_renaming: dict[str, str]) -> "TensorStorage":
         return TensorStorage(
@@ -111,6 +113,10 @@ def to_yaml(self):
             "above_loop_index": self.above_loop_index,
             "tile_size": self.tile_size,
         }
+        
+class TensorStorage2(TensorStorage):
+    def __repr__(self):
+        return f"TensorStorage2({self.tensor_id}, {self.backer_id}, {self.above_loop_index}, {self.tile_size})"
 
 
 @dataclass(frozen=True)
@@ -209,6 +215,8 @@ def merge_next(self, n: "SIM", next_live_tensors: set[str], resource2capacity: d
         shared_loop_index = self.tiling.shared_loop_index(n.tiling.tensor_names)
         tiling = n.tiling.absorb_tensors(self.tiling, next_live_tensors)
         next_shared_loop_index = tiling.shared_loop_index(next_live_tensors)
+        # assert all(t.tensor_id in next_live_tensors for t in tiling.tensors), f"Did not free all dead tensors {tiling.tensors} {next_live_tensors}"
+        # assert all
         mapping = self.mapping.merge(n.mapping, shared_loop_index, next_shared_loop_index, resource2capacity, next_live_tensors, delay=delay)
         s = SIM(tiling, mapping)
         assert len(tiling.loops) == next_shared_loop_index + 1, f"{self.tiling} {n.tiling} {next_shared_loop_index + 1} -> {tiling} {len(tiling.loops)}"
@@ -220,8 +228,10 @@ def get_shared_loop_index(self, next_live_tensors: set[str]) -> int:
         live_tensors = list(self.tiling.tensor_names) + [next_live_tensors]
         return self.tiling.shared_loop_index(live_tensors)
 
-    def consolidate(self, next_live_tensors: set[str] = None, resource2capacity: dict[str, int] = None):
+    def consolidate(self, next_live_tensors: set[str] = None, resource2capacity: dict[str, int] = None, shared_tensors: set[str] = None):
         dead_tensors = set(self.tensors) - (next_live_tensors or set())
+        shared_tensors = shared_tensors or set()
+        shared_loop_index = self.tiling.shared_loop_index(shared_tensors | (next_live_tensors or set()))  # next_live_tensors defaults to None; guard the union like dead_tensors above
         for t in dead_tensors:
             self._free_tensor(t)
         if next_live_tensors is None:
@@ -231,9 +241,8 @@ def consolidate(self, next_live_tensors: set[str] = None, resource2capacity: dic
             # Can free the deepest of:
             # - The shared loop with the next SIM
             # - My deepest loop that hasn't yet been freed
-            shared_loop_index = self.tiling.shared_loop_index(next_live_tensors)
-            if self.tensors:
-                shared_loop_index = max(shared_loop_index, max(t.above_loop_index for t in self.tensors.values()))
+            # if self.tensors:
+            #     shared_loop_index = max(shared_loop_index, max(t.above_loop_index for t in self.tensors.values()))
             self.mapping.free_to_loop_index(shared_loop_index+1, resource2capacity)
 
     def __eq__(self, other):