Skip to content

Commit d67b482

Browse files
committed
Merge branch 'main' of github.com:Accelergy-Project/timeloop-python
2 parents cd7901f + d4a80f5 commit d67b482

6 files changed

Lines changed: 184 additions & 48 deletions

File tree

pytimeloop/fastfusion/mapper/mapper_snowcat.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ def mapper(
3030
explore_glb_uneven,
3131
spec,
3232
tmp_path,
33-
ffmt: bool=False
33+
ffmt: bool=False,
34+
ffmt_refetch_weights: bool=True,
3435
):
3536
logger.info(f"Calling mapper for {spec}")
3637

@@ -65,7 +66,8 @@ def mapper(
6566
explore_glb_uneven=explore_glb_uneven,
6667
spec=spec,
6768
energy_dict=energy_dict,
68-
ffmt=ffmt
69+
ffmt=ffmt,
70+
ffmt_refetch_weights=ffmt_refetch_weights,
6971
)
7072

7173
generated_data = {}
@@ -132,12 +134,12 @@ def detect_similar_einsums(workload, analyzer, separated_einsums=None):
132134
found = False
133135
for from_einsum in ref_to_to_einsums:
134136
rank_renaming, tensor_renaming = is_equivalent(from_einsum,
135-
einsum,
136-
workload,
137-
analyzer)
137+
einsum,
138+
workload,
139+
analyzer)
138140
if rank_renaming is not None:
139141
ref_to_to_einsums[from_einsum][einsum] = (rank_renaming,
140-
tensor_renaming)
142+
tensor_renaming)
141143
found = True
142144
break
143145
if not found:
@@ -158,6 +160,14 @@ def convert_rank_to_group_renaming(ref_to_to_einsums, equiv_ranks):
158160

159161

160162
def get_ffmt_separated_einsums(workload):
163+
einsum_id_to_name = workload.einsum_id_to_name()
164+
if len(einsum_id_to_name) == 1:
165+
return [{0}]
166+
elif len(einsum_id_to_name) == 2:
167+
return [{0}, {1}]
168+
elif len(einsum_id_to_name) == 3:
169+
return [{0}, {1}, {2}]
170+
161171
first_einsum = {0}
162172
second_einsum = {1}
163173
last_einsum = {max(workload.einsum_id_to_name().keys())}

pytimeloop/fastfusion/mapper/per_einsum_mapper.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -687,4 +687,11 @@ def make_temporal_fors_with_smallest_tile(original, ranks):
687687
mapping = original.copy()
688688
for r in ordered_ranks:
689689
mapping.add_temporal(r, tile_shape=1)
690+
yield mapping
691+
692+
def make_temporal_fors_in_order(original, ranks):
693+
for i in range(len(ranks)+1):
694+
mapping = original.copy()
695+
for r in ranks[:i]:
696+
mapping.add_temporal(r)
690697
yield mapping

pytimeloop/fastfusion/mapper/per_einsum_mapper_snowcat.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
from copy import deepcopy
22
from collections import defaultdict
3-
from collections.abc import Callable, Set
4-
from itertools import combinations, product, permutations
5-
from functools import reduce
6-
from operator import or_, mul
73

84
from joblib import Parallel, delayed
95

@@ -27,7 +23,8 @@ def per_einsum_mapper_snowcat(
2723
explore_glb_uneven,
2824
einsums_to_explore,
2925
energy_dict,
30-
ffmt=False
26+
ffmt=False,
27+
ffmt_refetch_weights=True,
3128
):
3229
data = {}
3330
for einsum_id in einsums_to_explore:
@@ -70,13 +67,15 @@ def per_einsum_mapper_snowcat(
7067
intermediate_tensors,
7168
tensor_to_relevant_ranks,
7269
einsum_id,
73-
workload)
70+
workload,
71+
refetch_weights=ffmt_refetch_weights)
7472

7573
n_jobs=32
7674
parallelized_spaces, task_spaces = \
7775
split_dependent_product(n_split_min=n_jobs, spaces=subspaces)
7876

7977
partial_mappings = list(dependent_product(parallelized_spaces))
78+
partial_mappings = [x if isinstance(x, tuple) else (x,) for x in partial_mappings]
8079

8180
def per_worker_exploration(*args):
8281
analyzer = LooptreeWorkloadDependencyAnalyzer(workload)
@@ -123,12 +122,5 @@ def per_worker_exploration(*args):
123122
for k, v in res.items():
124123
data[einsum_id][k] += v
125124

126-
print(einsum_id)
127-
for k, v in data[einsum_id].items():
128-
min_metric = float("inf")
129-
for m in v:
130-
min_metric = min(min_metric, m["Offchip_Ac"])
131-
print(min_metric)
132-
133125
return data
134126

pytimeloop/fastfusion/mapper/per_einsum_subspaces/snowcat.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ def off_chip_storage(mapping):
2424
def fused_temporal_fors(mapping, unfused_tensors):
2525
for partial_mapping in make_temporal_fors(mapping, all_ranks):
2626
# for partial_mapping in make_temporal_fors(mapping, all_ranks):
27-
for partial_mapping in make_temporal_fors_with_smallest_tile(mapping, all_ranks):
28-
yield partial_mapping, unfused_tensors
27+
for partial_mapping in make_temporal_fors_with_smallest_tile(partial_mapping, all_ranks):
28+
yield partial_mapping, unfused_tensors
2929

3030

3131
def glb_storage(mapping, unfused_tensors):
Lines changed: 65 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
from pytimeloop.fastfusion.mapper.per_einsum_mapper import LinearMapping, make_storage, make_temporal_fors, make_temporal_fors_with_smallest_tile
1+
from pytimeloop.fastfusion.mapper.per_einsum_mapper import LinearMapping, make_storage, make_temporal_fors, make_temporal_fors_with_smallest_tile, make_temporal_fors_in_order
22

33
def make_ffmt_subspaces(tensors,
44
intermediate_tensors,
55
tensor_to_relevant_ranks,
66
einsum_id,
7-
workload):
7+
workload,
8+
refetch_weights: bool=True):
9+
810
def off_chip_storage(mapping):
911
off_chip_must_retain = tensors - intermediate_tensors
1012
off_chip_can_retain = intermediate_tensors
@@ -25,44 +27,80 @@ def off_chip_storage(mapping):
2527
M = all_ranks[0]
2628
N = all_ranks[1]
2729
K = all_ranks[2]
28-
29-
if einsum_id == 0:
30-
allowed_fused_ranks = all_ranks
31-
elif einsum_id == 1:
32-
allowed_fused_ranks = {M, K}
33-
elif einsum_id == max(workload.einsum_id_to_name().keys()):
34-
allowed_fused_ranks = {M, N}
35-
else:
36-
allowed_fused_ranks = {M}
30+
weight_tensor = None
31+
input_tensor = None
32+
for tensor_id in workload.tensors_read_by_einsum(einsum_id):
33+
if tensor_to_relevant_ranks[tensor_id] == {K, N}:
34+
weight_tensor = tensor_id
35+
elif tensor_to_relevant_ranks[tensor_id] == {M, K}:
36+
input_tensor = tensor_id
37+
assert weight_tensor is not None
38+
assert input_tensor is not None
39+
output_tensor = next(iter(workload.tensors_written_by_einsum(einsum_id)))
40+
non_weight_tensor = tensors - {weight_tensor}
3741

3842
def fused_temporal_fors(mapping, unfused_tensors):
39-
for partial_mapping in make_temporal_fors(mapping, allowed_fused_ranks):
40-
# for partial_mapping in make_temporal_fors(mapping, all_ranks):
41-
for partial_mapping in make_temporal_fors_with_smallest_tile(mapping, all_ranks):
42-
yield partial_mapping, unfused_tensors
43+
if input_tensor in unfused_tensors:
44+
allowed_fused_ranks = [M, N, K]
45+
elif output_tensor in unfused_tensors:
46+
allowed_fused_ranks = [M, N]
47+
else:
48+
allowed_fused_ranks = [M, K]
49+
for partial_mapping in make_temporal_fors_in_order(mapping, allowed_fused_ranks):
50+
yield partial_mapping, unfused_tensors
4351

4452

45-
def glb_storage(mapping, unfused_tensors):
53+
def glb_storage_io(mapping, unfused_tensors):
4654
glb_fused_tensors = intermediate_tensors - unfused_tensors
4755
yield from make_storage(
4856
mapping,
4957
level=1,
50-
must_retain_tensors=tensors,
58+
must_retain_tensors=non_weight_tensor,
5159
can_retain_tensors=set(),
5260
must_fully_reuse_tensors=glb_fused_tensors,
5361
tensor_to_relevant_ranks=tensor_to_relevant_ranks,
54-
explore_uneven=True,
55-
add_split_at_tensors=glb_fused_tensors
62+
explore_uneven=False,
63+
add_split_at_tensors=glb_fused_tensors,
64+
return_retained_tensors=True,
5665
)
5766

58-
def mac(mapping):
67+
def intra_temporal_fors(mapping, _):
68+
for partial_mapping in make_temporal_fors_with_smallest_tile(mapping,
69+
{K, N}):
70+
yield partial_mapping, _
71+
72+
def glb_storage_weights(mapping, _):
73+
yield from make_storage(
74+
mapping,
75+
level=1,
76+
must_retain_tensors={weight_tensor},
77+
can_retain_tensors=set(),
78+
tensor_to_relevant_ranks=tensor_to_relevant_ranks,
79+
explore_uneven=False,
80+
return_retained_tensors=True,
81+
)
82+
83+
def mac(mapping, _):
5984
mapping.add_compute(einsum_id, 2)
6085
yield mapping
6186

62-
return [
63-
lambda: [LinearMapping()],
64-
off_chip_storage,
65-
fused_temporal_fors,
66-
glb_storage,
67-
mac
68-
]
87+
if refetch_weights:
88+
return [
89+
lambda: [LinearMapping()],
90+
off_chip_storage,
91+
fused_temporal_fors,
92+
glb_storage_io,
93+
intra_temporal_fors,
94+
glb_storage_weights,
95+
mac
96+
]
97+
else:
98+
return [
99+
lambda: [LinearMapping()],
100+
off_chip_storage,
101+
glb_storage_weights,
102+
fused_temporal_fors,
103+
glb_storage_io,
104+
intra_temporal_fors,
105+
mac
106+
]
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
from collections.abc import Mapping
2+
import itertools
3+
4+
import numpy as np
5+
import pandas as pd
6+
from joblib import Parallel, delayed
7+
8+
from pytimeloop.fastfusion.sim import SIM
9+
from pytimeloop.fastfusion.pareto import Pareto
10+
11+
def explore_fusion(einsum_to_result: Mapping):
12+
13+
r2 = {}
14+
for einsum_id, compat_dict in einsum_to_result.items():
15+
r2[einsum_id] = Parallel(n_jobs=1)(delayed(paretofy)(k, v) for k, v in compat_dict.items())
16+
17+
# for einsum_id, compat_dict in result.items():
18+
# r2[einsum_id] = [SIM(k, Pareto(pd.DataFrame(v).fillna(0))) for k, v in compat_dict.items()]
19+
20+
sims = list(r2.values())
21+
s = sims.pop(0)
22+
23+
24+
while sims:
25+
live_tensors = set.union(set(), *[sim[0].tensor_names for sim in sims])
26+
ns = sims.pop(0)
27+
next_live_tensors = set.union(set(), *[sim[0].tensor_names for sim in sims])
28+
29+
for s2 in s:
30+
s2.consolidate(live_tensors)
31+
32+
ns = SIM.combine_combineable(ns, next_live_tensors | s[0].tensor_names)
33+
ns = SIM.group_by_left(ns, s[0].tensor_names)
34+
s = SIM.combine_combineable(s, live_tensors)
35+
s = SIM.group_by_right(s, live_tensors)
36+
37+
print("\n\n")
38+
print("\n\n" + "=" * 100 + f"\n{len(sims) + 1} Remaining\n" + "=" * 100)
39+
40+
DO_PRINT = False
41+
42+
with open('s_keys.txt', 'w') as f:
43+
for key in sorted(s.keys()):
44+
f.write(f"{key}\n")
45+
46+
with open('s2_keys.txt', 'w') as f:
47+
for key in sorted(ns.keys()):
48+
f.write(f"{key}\n")
49+
50+
combined: list[SIM] = []
51+
for k in s:
52+
if k in ns:
53+
for a, b in itertools.product(s[k], ns[k]):
54+
if DO_PRINT:
55+
print(f"\t{a.tiling_str()} <--> {b.tiling_str()}")
56+
combined.append(a.merge_next(b, set(), delay=True))
57+
# combined_keys.append()
58+
elif DO_PRINT:
59+
print(f"\tNo match for {s[k][0].tiling_str()}")
60+
61+
for c, mapping in zip(combined, Parallel(n_jobs=128)(c.mapping for c in combined)):
62+
c.mapping = mapping
63+
64+
s = combined
65+
print(f"Generated {len(s)} solutions")
66+
67+
for s2 in s:
68+
s2.consolidate(set())
69+
s_final = SIM.combine_combineable(s, set())[0]
70+
data = s_final.mapping.data
71+
# Sort data by the columns "Latency" and "Energy"
72+
last_level_occupancy = None
73+
for i in reversed(range(3)):
74+
if f"RESOURCE_1_LEVEL_{i}" not in data:
75+
continue
76+
if last_level_occupancy is not None:
77+
non_left_cur_level_occupancy = data[f"RESOURCE_1_LEVEL_{i}"] + last_level_occupancy
78+
else:
79+
non_left_cur_level_occupancy = data[f"RESOURCE_1_LEVEL_{i}"]
80+
left_cur_level_occupancy = data[f"RESOURCE_1_LEFT_LEVEL_{i}"]
81+
last_level_occupancy = np.maximum(non_left_cur_level_occupancy,
82+
left_cur_level_occupancy)
83+
data["Occupancy"] = last_level_occupancy
84+
85+
return data
86+
87+
88+
def paretofy(k, v):
89+
return SIM(k, Pareto(pd.DataFrame(v).fillna(0)))

0 commit comments

Comments (0)