Skip to content

Commit 898f0b1

Browse files
WIP tagging
1 parent 35a3f80 commit 898f0b1

10 files changed

Lines changed: 249 additions & 61 deletions

File tree

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
from collections import defaultdict
2+
from pytimeloop.fastfusion.sim import Tag, TensorStorage, Tiling
3+
4+
def get_ffmt_tag_mha(
    einsum_id: str,
    backing_storages: set[TensorStorage],
    input_tensors: set[str],
    output_tensors: set[str],
    tiling: Tiling,
    rank_name_to_shared_name: dict[str, str]
):
    """Classify *tiling* for one MHA Einsum under the FFMT fusion model.

    Returns a tuple of tag strings. The first element is ``"FFMT_VALID"`` or
    ``"FFMT_INVALID"``; a valid tiling also carries a weight-tiling tag
    (``FFMT_WEIGHTS_{UNTILED,TILED,INVALID}``) and a chain-position tag
    (``FFMT_{UNFUSED,FIRST,MIDDLE,LAST}``).

    NOTE(review): this is WIP code (see the inline NOTEs below) — several of
    the tensor-name tables are acknowledged incomplete by their authors.
    """
    # Per-Einsum rank names, e.g. "BQK" for the B rank of Einsum "QK".
    B, H, M, F, P, G, E, D = (x + einsum_id for x in "BHMFPGED")

    # Fused input/output tensor names for each Einsum in the MHA chain.
    # ``None`` means the Einsum has no fused tensor on that side.
    einsum_id_to_input_output = {
        "Q": ["I_I_to_Q_K_V", "Q_Q_to_QK"],
        "K": ["I_I_to_Q_K_V", None],  # NOTE: TANNER ADDED THESE
        "V": ["I_I_to_Q_K_V", None],  # NOTE: TANNER ADDED THESE
        "QK": ["Q_Q_to_QK", "QK_QK_to_AV"],  # NOTE: K IS MISSING
        "AV": ["QK_QK_to_AV", "AV_AV_to_Z"],  # NOTE: AV IS MISSING
        "Z": ["AV_AV_to_Z", "Z_Z_to_n"],
    }
    a, b = einsum_id_to_input_output[einsum_id]

    tags = []

    # Scan level-1 tensors to find how deep weights vs. fused tensors are
    # tiled, and whether this Einsum is first/last in its fusion chain.
    min_weight_index = None
    max_non_weight_index = 0
    first, last = True, True
    for t in tiling.tensors:
        if t.backer_id != 1:
            continue
        # A fused input/output tensor that is *backed* here means the chain
        # does not extend past this Einsum on that side.
        if t.tensor_id in input_tensors and t in backing_storages:
            first = False
        if t.tensor_id in output_tensors and t in backing_storages:
            last = False
        if t.tensor_id != a and t.tensor_id != b:  # Weights!
            if min_weight_index is None:
                min_weight_index = t.above_loop_index
            else:
                min_weight_index = min(min_weight_index, t.above_loop_index)
        else:
            max_non_weight_index = max(max_non_weight_index, t.above_loop_index)

    # Weights must not sit below (deeper than) any fused tensor.
    if min_weight_index == 2:
        tags.append("FFMT_WEIGHTS_UNTILED")
    elif min_weight_index is None or min_weight_index < max_non_weight_index:
        tags.append("FFMT_WEIGHTS_INVALID")
    else:
        tags.append("FFMT_WEIGHTS_TILED")

    # Candidate (rank-permutation-prefix, (input-loops, output-loops)) pairs.
    to_try = [([B, H], (2, 2)), ([B, H, M], (3, 3))]
    other_ranks = {
        "Q": [B, H, M, E, D],
        "K": [B, H, M, E, D],
        "V": [B, H, M, E, D],
        "QK": [B, H, M, P, E],
        "AV": [B, H, M, F, P],
        "Z": [B, H, M, G],
    }[einsum_id]

    valid = False
    if first and last:  # Unfused
        to_try = []
        valid = True
        tags.append("FFMT_UNFUSED")
    elif first:  # First Einsum in a chain
        to_try += [(other_ranks[:4], (3, 4)), (other_ranks, (5, 4))]
        tags.append("FFMT_FIRST")
    elif last:  # Last Einsum in a chain
        # NOTE(review): ``other_ranks[4:]`` yields at most one rank here (and
        # none for "Z") — confirm this slice is intended vs. ``[:4]``.
        to_try += [(other_ranks[4:], (3, 4))]
        tags.append("FFMT_LAST")
    else:  # Middle Einsum in a chain
        if einsum_id == "AV":
            # AV consumes its fused tensor on the opposite side; swap roles.
            a, b = b, a
            other_ranks[-2], other_ranks[-1] = other_ranks[-1], other_ranks[-2]
        to_try += [(other_ranks[:4], (3, 4))]
        tags.append("FFMT_MIDDLE")

    for c, (a_loops, b_loops) in to_try:
        # Translate per-Einsum rank names to shared names; "*" is a wildcard.
        perm = [rank_name_to_shared_name[x] for x in c] + ["*"]
        check_tensors = [TensorStorage(a, a_loops, 1, "*")]
        if b is not None:
            check_tensors.append(TensorStorage(b, b_loops, 1, "*"))
        # Evaluate the permutation match once (the original WIP code called
        # matches_permutation twice, with a dead ``valid = valid`` no-op).
        if tiling.matches_permutation(perm) and tiling.has_tensor(*check_tensors):
            valid = True

    if valid:
        return ("FFMT_VALID", *tags)
    return ("FFMT_INVALID",)
95+
96+
def get_tileflow_tag_mha(
    einsum_id: str,
    backing_storages: set[TensorStorage],
    input_tensors: set[str],
    output_tensors: set[str],
    tiling: Tiling,
    rank_name_to_shared_name: dict[str, str]
):
    """Tag *tiling* as TileFlow-valid iff it is an "even" mapping.

    A mapping is even when every storage level holds all of its tensors at a
    single loop depth — i.e. each ``backer_id`` maps to exactly one distinct
    ``above_loop_index``. Only *tiling* is inspected; the other parameters are
    accepted for signature parity with the other tagger functions.
    """
    levels_per_storage: dict = {}
    for storage in tiling.tensors:
        levels_per_storage.setdefault(storage.backer_id, set()).add(
            storage.above_loop_index
        )
    is_even = all(len(levels) == 1 for levels in levels_per_storage.values())
    return ("TILEFLOW_VALID",) if is_even else ("TILEFLOW_INVALID",)

pytimeloop/fastfusion/mapper/mapper.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,7 @@ def generate_data(from_einsum: int, to_einsum: int, data, rank_renaming, tensor_
146146

147147

148148
def _convert_tiling(tiling: Tiling, rank_renaming, tensor_renaming):
149-
return Tiling(
150-
loops=tuple(l.rename(rank_renaming, tensor_renaming) for l in tiling.loops),
151-
tensors=frozenset(ts.rename(rank_renaming, tensor_renaming) for ts in tiling.tensors),
152-
)
149+
return tiling.rename(rank_renaming, tensor_renaming)
153150

154151

155152
def _convert_stats(from_einsum: int, to_einsum: int, stats, rank_renaming, tensor_renaming):

pytimeloop/fastfusion/mapper/mapper_snowcat.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,13 @@ def mapper(
6363
separated_einsums = None
6464
else:
6565
separated_einsums = get_ffmt_separated_einsums(workload)
66-
grouped_similar_einsums = convert_rank_to_group_renaming(
67-
detect_similar_einsums(workload, analyzer, separated_einsums),
68-
equivalent_groups
69-
)
66+
if not tag_with:
67+
grouped_similar_einsums = convert_rank_to_group_renaming(
68+
detect_similar_einsums(workload, analyzer, separated_einsums),
69+
equivalent_groups
70+
)
71+
else:
72+
grouped_similar_einsums = {einsum: {} for einsum in workload.einsum_id_to_name()}
7073
logger.info(f"Found {len(grouped_similar_einsums)} unique Einsums\n"
7174
+ f"\tConverter: {grouped_similar_einsums}")
7275

@@ -128,15 +131,7 @@ def generate_data(from_einsum: int, to_einsum: int, data, rank_renaming, tensor_
128131

129132

130133
def _convert_tiling(tiling: Tiling, rank_renaming, tensor_renaming):
131-
return Tiling(
132-
loops=tuple(Loop(rank_renaming[l.rank_id], l.bound, l.is_spatial)
133-
for l in tiling.loops),
134-
tensors=frozenset(TensorStorage(tensor_renaming[ts.tensor_id],
135-
ts.above_loop_index,
136-
ts.backer_id,
137-
ts.tile_size)
138-
for ts in tiling.tensors)
139-
)
134+
return tiling.rename(rank_renaming, tensor_renaming)
140135

141136

142137
def _convert_stats(from_einsum: int, to_einsum: int, stats, rank_renaming, tensor_renaming):

pytimeloop/fastfusion/mapper/per_einsum_mapper_snowcat.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ def per_einsum_mapper_snowcat(
8686

8787
partial_mappings = list(dependent_product(parallelized_spaces))
8888
partial_mappings = [x if isinstance(x, tuple) else (x,) for x in partial_mappings]
89+
rank_id_to_name = {v: k for k, v in rank_name_to_id.items()}
90+
tensor_id_to_name = {v: k for k, v in tensor_name_to_id.items()}
91+
input_tensors = set(tensor_id_to_name[t] for t in workload.tensors_read_by_einsum(einsum_id))
92+
output_tensors = set(tensor_id_to_name[t] for t in workload.tensors_written_by_einsum(einsum_id))
93+
rank_name_to_shared_name = {
94+
rank_id_to_name[k]: rank_id_to_name[v] for k, v in equivalent_groups.rank_to_group_id.items()
95+
}
8996

9097
# successful_partial_mappings = []
9198
# for p in partial_mappings:
@@ -148,8 +155,11 @@ def per_worker_exploration(*args):
148155
einsum_shape=einsum_shape,
149156
metrics=metrics,
150157
einsum_id_to_name=einsum_id_to_name,
151-
rank_id_to_name={v: k for k, v in rank_name_to_id.items()},
152-
tensor_id_to_name={v: k for k, v in tensor_name_to_id.items()},
158+
rank_id_to_name=rank_id_to_name,
159+
tensor_id_to_name=tensor_id_to_name,
160+
rank_name_to_shared_name=rank_name_to_shared_name,
161+
input_tensors=input_tensors,
162+
output_tensors=output_tensors,
153163
tag_with=tag_with,
154164
)
155165
return result
@@ -161,7 +171,7 @@ def per_worker_exploration(*args):
161171
data[einsum_id] = defaultdict(list)
162172
for res in results:
163173
for k, v in res.items():
164-
data[einsum_id][k] += v
174+
data[einsum_id][k[0]] += v
165175

166176
return data
167177

pytimeloop/fastfusion/mapper/process_results.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
RESERVED_COLUMNS,
1414
TENSORS,
1515
IN_PROGRESS_STATS,
16+
TAGS,
1617
)
1718
from pytimeloop.fastfusion.sim import TensorStorage, Tiling, Loop
1819

@@ -37,7 +38,6 @@ def all_metrics(cls):
3738

3839
# DEBUG_VISUALIZATION = Metrics.ALL_TENSORS | METRICS.PARTIAL_STATS
3940

40-
4141
def process_result(
4242
result,
4343
shape,
@@ -54,6 +54,9 @@ def process_result(
5454
einsum_id_to_name,
5555
rank_id_to_name,
5656
tensor_id_to_name,
57+
rank_name_to_shared_name,
58+
input_tensors: set[str],
59+
output_tensors: set[str],
5760
logfunc=None,
5861
metrics=Metrics.all_metrics(),
5962
tag_with: tuple[callable] = (),
@@ -75,7 +78,7 @@ def process_result(
7578
)
7679

7780
cur_idx = 0
78-
all_backing_storages = []
81+
backing_storages = []
7982
all_storages = []
8083
intermediates_to_find = set(intermediate_tensors)
8184
found_tensors = set()
@@ -95,7 +98,7 @@ def record_storage(node):
9598
intermediates_to_find.remove(storage.tensor_id)
9699
if storage.tensor_id not in found_tensors:
97100
found_tensors.add(storage.tensor_id)
98-
all_backing_storages.append(storage)
101+
backing_storages.append(storage)
99102

100103
logstring.append(f"Strg({node['dspace']} in {node['target']})")
101104

@@ -107,6 +110,7 @@ def record_loop(node):
107110
tile_shape = shape[cur_idx]
108111
cur_idx += 1
109112
rank_id = equiv_groups.rank_to_group_id[node["rank"]]
113+
# rank_id = node["rank"]
110114
loop = Loop(
111115
rank_id_to_name[rank_id],
112116
tile_shape,
@@ -124,21 +128,33 @@ def record_loop(node):
124128
elif node["type"] == "spatial" or node["type"] == "temporal":
125129
record_loop(node)
126130

127-
n_fused_loops = max(t.above_loop_index for t in all_backing_storages)
131+
n_fused_loops = max(t.above_loop_index for t in backing_storages)
128132
tiling_full = Tiling(
129133
loops=tuple(full_tiling),
130134
tensors=frozenset(all_storages),
131135
)
136+
137+
tagger_args = dict(
138+
einsum_id=einsum_id,
139+
backing_storages=backing_storages,
140+
input_tensors=input_tensors,
141+
output_tensors=output_tensors,
142+
tiling=tiling_full,
143+
rank_name_to_shared_name=rank_name_to_shared_name,
144+
)
145+
# print(tiling_full)
132146

133147
tiling_compatibility = Tiling(
134148
loops=tuple(full_tiling[:n_fused_loops]),
135-
tensors=frozenset(all_backing_storages),
136-
# tags=fzs().union(*([set()] + [set(t(einsum_id, tiling_full)) for t in tag_with]))
149+
tensors=frozenset(backing_storages),
150+
tags=fzs().union(*([set()] + [set(t(**tagger_args)) for t in tag_with]))
137151
)
138-
139-
# assert max(t.above_loop_index for t in all_backing_storages) == len(tiling_compatibility.loops), (
152+
153+
if "FFMT_VALID" in tiling_compatibility.tags:
154+
print(tiling_compatibility)
155+
# assert max(t.above_loop_index for t in backing_storages) == len(tiling_compatibility.loops), (
140156
# f"\n\ttiling_compatibility: {tiling_compatibility} "
141-
# f"\n\tall_backing_storages: {all_backing_storages} "
157+
# f"\n\tbacking_storages: {backing_storages} "
142158
# f"\n\ttiling_full: {tiling_full}"
143159
# )
144160

@@ -150,14 +166,14 @@ def record_loop(node):
150166
if Metrics.ENERGY in metrics:
151167
results["Energy"] = energy
152168

153-
offchip_accesses = 0
169+
offchip_ac = 0
154170
for (level, tensor, einsum), count in accesses.items():
155171
if level == 0:
156-
offchip_accesses += count
172+
offchip_ac += count
157173
logstring.append(f"Ac_{level}_{tensor}={count:.2e}")
158174

159175
if Metrics.OFF_CHIP_ACCESSES in metrics:
160-
results["Offchip_Ac"] = offchip_accesses
176+
results["Offchip Accesses"] = offchip_ac
161177

162178
logstring.append(f"{result.fanout}")
163179

@@ -166,7 +182,7 @@ def record_loop(node):
166182
# be backed
167183
for r in all_storages:
168184
r: TensorStorage
169-
if r not in all_backing_storages:
185+
if r not in backing_storages:
170186
key = nameloop2col(r.backer_id, r.above_loop_index)
171187
results.setdefault(key, 0)
172188
results[key] += r.tile_size
@@ -184,15 +200,18 @@ def record_loop(node):
184200
logstring.append(f"Results: {results}")
185201
results[LOGSTRING] = {einsum_id: str(logstring)}
186202
results[MAPPING] = {einsum_id: tiling_full}
187-
results[TENSORS] = {einsum_id: all_backing_storages}
203+
results[TENSORS] = {einsum_id: backing_storages}
188204
results[STATS] = {
189205
einsum_id: {k: v for k, v in results.items() if k not in RESERVED_COLUMNS}
190206
}
191207
results[IN_PROGRESS_STATS] = {einsum_id: {}}
192208
results[MAPPING_HASH] = {einsum_id: hash((einsum_id, tiling_compatibility))}
209+
results[TAGS] = {einsum_id: tiling_compatibility.tags}
210+
211+
key = (tiling_compatibility, fzs(results.keys()))
193212

194213
is_pareto = True
195-
for prev_stats in compatibility_to_df[tiling_compatibility]:
214+
for prev_stats in compatibility_to_df[key]:
196215
keys = [k for k in results if k not in DICT_COLUMNS]
197216
if (
198217
fzs(prev_stats.keys()) == fzs(results.keys())
@@ -204,6 +223,6 @@ def record_loop(node):
204223
# TO DO: Index into the DF with both tiling compatibility and
205224
# the result keys
206225
if is_pareto:
207-
compatibility_to_df[tiling_compatibility].append(results)
226+
compatibility_to_df[key].append(results)
208227
results_return = {k: v for k, v in results.items() if k != LOGSTRING}
209228
return is_pareto, results_return, logstring

pytimeloop/fastfusion/mapper/simexplore.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ def fuse_sims(
100100
resource2capacity=resource2capacity,
101101
shared_tensors=set(),
102102
)
103+
104+
# TODO: Lookahead by one SIM. If we're going to create a tiling that has loops
105+
# that are not in the ranks of the next SIM, we should drop that tiling.
103106

104107
while sims:
105108
nbuckets.append(len(left))

pytimeloop/fastfusion/pareto.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@
2626
TENSORS = "__TENSORS"
2727
IN_PROGRESS_STATS = "__IN_PROGRESS_STATS"
2828
MAPPING_HASH = "__MAPPING_HASH"
29+
TAGS = "__TAGS"
2930

3031
RESERVED_COLUMNS = set(
31-
[LOGSTRING, MAPPING, STATS, TENSORS, IN_PROGRESS_STATS, MAPPING_HASH]
32+
[LOGSTRING, MAPPING, STATS, TENSORS, IN_PROGRESS_STATS, MAPPING_HASH, TAGS]
3233
)
3334
DICT_COLUMNS = set(
34-
[LOGSTRING, MAPPING, STATS, TENSORS, IN_PROGRESS_STATS, MAPPING_HASH]
35+
[LOGSTRING, MAPPING, STATS, TENSORS, IN_PROGRESS_STATS, MAPPING_HASH, TAGS]
3536
)
3637

3738
_resource_name_nloops_reg = re.compile(r"RESOURCE_(.+?)(?:_LEFT)?_LEVEL_(-?\d+)")

0 commit comments

Comments
 (0)