from typing import Callable

import tvm
from tvm.ir.module import IRModule
from tvm.tir import Schedule
import tvm.meta_schedule as ms


###################### INTERFACE ######################

def apply_schedule(mod: IRModule, schedule_func: Callable, function_name: str) -> IRModule:
    schedule = tvm.tir.Schedule(mod)
    tasks = mod.get_global_vars()
    print([t.name_hint for t in tasks])
    for task in tasks:
        name = task.name_hint
        if function_name in name:  # check for "dense", different for "convolution"
            print(name, "applying schedule")
            if "dense" in name or "relu" in name:
                schedule = schedule_func(schedule, name, False, bn=16, factor=4,
                                         reorder=True, vectorize=True, unroll=True,
                                         parallel=True, blocking=True, packed=False)
            else:  # TODO add case for conv
                schedule = schedule_func(schedule, name)
    return schedule.mod
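
# Illustrative sketch (not part of the original interface): how apply_schedule might be
# used with the deterministic pfp_dense schedule defined below.  "my_module" is a
# placeholder IRModule produced elsewhere; the "dense" filter matches the check above.
def _example_apply_schedule(my_module: IRModule) -> IRModule:
    # schedule every prim_func whose name contains "dense" with pfp_dense
    return apply_schedule(my_module, pfp_dense, "dense")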

def tune(args, MyModule, target, device_key=None, work_dir="./tune_tmp", max_trials_global=256, num_trials_per_iter=64):
    # print(MyModule)
    tasks = MyModule.get_global_vars()
    print(f"tune tasks = {tasks}")
    for task in tasks:
        name = task.name_hint
        print(80 * '*')
        print(f"** task name = {name} **")
        if name == "main":  # FIXME exclude all non prim_funcs, this works for the moment
            continue
        # elif "dense" in name:  # DEBUG: skip dense
        #     continue
        mod_linear = tvm.IRModule.from_expr(MyModule[name].with_attr("global_symbol", "main"))
        has_schedule = False
        if "dense_pfp" in name:  # TODO all operations need this
            if args.tune_dense_custom_schedule:
                print('use dense custom schedule for tuning')
                has_schedule = True
                sch = tvm.tir.Schedule(mod_linear)
                sched_func = lambda _: pfp_dense(sch, stochastic=args.tune_dense_stochastic, func_name="main",
                                                 name="".join([i for i in name if not i.isdigit()]),
                                                 reorder=args.tune_dense_reorder, vectorize=args.tune_dense_vectorize,
                                                 unroll=args.tune_dense_unroll, parallel=args.tune_dense_parallel,
                                                 blocking=args.tune_dense_blocking, packed=args.tune_dense_packed)
                print('map dense...')
            else:
                print('use dense MetaScheduler for tuning')
        elif "_pool" in name:
            if not args.tune_lenet_pool:
                continue  # required for tuning without pool; tuning the PFP LeNet performs better without pool tuning!
        print(f'scheduler tune config: max_trials_global = {max_trials_global}, num_trials_per_iter = {num_trials_per_iter}')
        database = ms.tune_tir(
            mod=mod_linear,
            target=str(target) + " -num-cores 64",
            # target=str(target) + " -num-cores 1",
            max_trials_global=max_trials_global,
            num_trials_per_iter=num_trials_per_iter,
            runner=ms.runner.RPCRunner(ms.runner.RPCConfig(tracker_host="localhost", tracker_port=9000,
                                                           tracker_key=device_key, session_timeout_sec=120)),
            space=ms.space_generator.ScheduleFn(sched_func) if has_schedule else "post-order-apply",
            work_dir=work_dir,
        )
        sch = ms.tir_integration.compile_tir(database, mod_linear, target)
        if sch is not None:
            new_func = sch.mod["main"].with_attr("global_symbol", name)
            db_mod = tvm.IRModule.from_expr(new_func)
            database.commit_workload(db_mod)
            MyModule.update_func(task, new_func)
        else:
            print("No Schedule found for ", name)
    return MyModule
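
# Illustrative sketch (not part of the original code): invoking tune() with a minimal
# stand-in for the "args" namespace read above.  The attribute values, the "llvm" target,
# and the device key are placeholders; a running RPC tracker on localhost:9000 is assumed
# by the RPCRunner configured in tune().
def _example_tune(my_module: IRModule) -> IRModule:
    from types import SimpleNamespace
    args = SimpleNamespace(
        tune_dense_custom_schedule=True,
        tune_dense_stochastic=True,
        tune_dense_reorder=True,
        tune_dense_vectorize=True,
        tune_dense_unroll=True,
        tune_dense_parallel=True,
        tune_dense_blocking=True,
        tune_dense_packed=False,
        tune_lenet_pool=False,  # skip pooling ops during tuning
    )
    target = tvm.target.Target("llvm")
    return tune(args, my_module, target, device_key="local",
                max_trials_global=64, num_trials_per_iter=32)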

def load_tuning(MyModule, target, work_dir="./tune_tmp", print_trace=False):
    database = ms.Database.create("json", work_dir=work_dir, module_equality="structural")
    tasks = MyModule.get_global_vars()
    for task in tasks:
        name = task.name_hint
        print(f"task = {name}")
        if name == "main":
            continue
        mod_linear = tvm.IRModule.from_expr(MyModule[name].with_attr("global_symbol", "main"))
        sch = ms.tir_integration.compile_tir(database, mod_linear, target)
        if sch is not None:
            if print_trace:
                print(sch.trace)
            print("Schedule found for:", name)
            new_func = sch.mod["main"].with_attr("global_symbol", name)
            MyModule.update_func(task, new_func)
        else:
            print("No Schedule found for ", name)
    return MyModule
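
# Illustrative sketch (not part of the original code): reloading previously tuned
# schedules from the same work_dir that tune() wrote to.  The target string is a placeholder.
def _example_load_tuning(my_module: IRModule) -> IRModule:
    target = tvm.target.Target("llvm")
    return load_tuning(my_module, target, work_dir="./tune_tmp", print_trace=True)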

###################### SCHEDULES ######################
"""
Schedules for MM-like operations exist based on:
    packed / non-packed: access pattern to the second operand
    stochastic / deterministic: stochastic schedules work as templates for the "AutoTuner" MetaSchedule,
        deterministic schedules can be applied directly with manually chosen parameters
    optimized: apply all optimizations
Note: all optimized schedules here can be created from pfp_dense; the current implementation is kept for readability.
"""

def pfp_dense(sch: tvm.tir.Schedule, func_name="main", stochastic=True, bn=16, factor=4, name="dense_pfp",
              reorder=True, vectorize=True, unroll=True, parallel=True, blocking=False, packed=False):
    """
    Flexible schedule for a dense PFP operator.

    sch: base schedule, created directly from the module
    func_name: name of the scheduled PrimFunc; if embedded in Relax, the relevant function name must be provided,
        otherwise just use the default "main"
    stochastic: True -> generate a stochastic schedule, False -> generate a deterministic schedule
    bn: blocking factor, irrelevant if stochastic
    factor: split factor of the reduction loop, determines the unroll factor, irrelevant if stochastic
    name: name of the operation; this schedule works for matrix-multiplication-like computations (tested: dense / dense_pfp)
    reorder: whether loops are reordered
    vectorize: vectorize the innermost loop (must be a spatial loop)
    unroll: unroll the second innermost loop
    parallel: apply parallelization to the outermost loop
    blocking: block access to matrix A into blocks of size "bn"
    packed: reorder access to matrix B into packs
    """
    print('FOUND PFP DENSE TUNING SCHEDULE')
    if packed:
        name += "_packed"
    try:
        block_dense = sch.get_block(name, func_name=func_name)
    except Exception:
        # fall back to the first-layer block name
        name += "_first_layer"
        block_dense = sch.get_block(name, func_name=func_name)
    i, j, k = sch.get_loops(block_dense)
    if blocking:
        if stochastic:
            # sample tile sizes per loop (search space for MetaSchedule)
            factors_outer = sch.sample_partitioned_tile(loop=i, n=2)
            factors_inner = sch.sample_partitioned_tile(loop=j, n=2)
            factors_reduction = sch.sample_partitioned_tile(loop=k, n=2)
        else:
            factors_outer = [None, bn]
            factors_inner = [None, bn]
            factors_reduction = [None, factor]
        i0, i1 = sch.split(i, factors=factors_outer)
        j0, j1 = sch.split(j, factors=factors_inner)
        k0, k1 = sch.split(k, factors=factors_reduction)
        if reorder:
            sch.reorder(i0, j0, k0, i1, k1, j1)
        if vectorize:
            sch.vectorize(j1)
        if unroll:
            sch.unroll(k1)
        if parallel:
            sch.parallel(i0)
    else:
        if reorder:
            sch.reorder(i, j, k)
        if vectorize:
            sch.vectorize(j)
        if unroll:
            sch.unroll(j)
        if parallel:
            sch.parallel(i)
    if packed:
        if "pfp" in name:  # pfp_dense
            block_A = sch.get_block("packedWM", func_name=func_name)
            bigN, _, littleN = sch.get_loops(block_A)
            sch.vectorize(littleN)
            sch.parallel(bigN)
            block_B = sch.get_block("packedWS", func_name=func_name)
            bigN, _, littleN = sch.get_loops(block_B)
            sch.vectorize(littleN)
            sch.parallel(bigN)
        else:  # dense (regular)
            block_A = sch.get_block("packedW", func_name=func_name)
            bigN, _, littleN = sch.get_loops(block_A)
            sch.vectorize(littleN)
            sch.parallel(bigN)
    return sch
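
# Illustrative sketch (not part of the original code): applying pfp_dense deterministically
# to a standalone prim_func wrapped as "main".  "dense_mod" is a placeholder IRModule that
# contains a block named "dense_pfp"; the tile sizes mirror the defaults above.
def _example_pfp_dense(dense_mod: IRModule) -> IRModule:
    sch = tvm.tir.Schedule(dense_mod)
    sch = pfp_dense(sch, func_name="main", stochastic=False, bn=16, factor=4,
                    reorder=True, vectorize=True, unroll=True, parallel=True,
                    blocking=True, packed=False)
    return sch.mod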