Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
417 commits
Select commit Hold shift + click to select a range
1b4810f
optimized add_nv_gpu
Ziminli Oct 18, 2024
3f43351
mlu argmax
xgqdut2016 Oct 18, 2024
639e685
rearrange
Oct 18, 2024
b5723f6
fix format
Oct 18, 2024
5bc4dc4
rms_norm
Oct 18, 2024
b38e3c0
bang argmax
xgqdut2016 Oct 18, 2024
97a8a70
Add fp32 support
Ziminli Oct 18, 2024
cf335a9
Merge pull request #82 from PanZezhong1725/randomSample
PanZezhong1725 Oct 21, 2024
11ccfe1
randomSample
xgqdut2016 Oct 21, 2024
847f494
Merge pull request #86 from PanZezhong1725/randomSample
PanZezhong1725 Oct 21, 2024
a9aec43
Enhanced fp16 and fp32 performance by applying better block size and …
Ziminli Oct 22, 2024
a292a7c
add swiglu support fp32/fp16
Oct 24, 2024
e6e50c2
Add support for fp32
Ziminli Oct 24, 2024
ecc0930
Merge pull request #76 from PanZezhong1725/add_fp32
PanZezhong1725 Oct 24, 2024
bb5bd91
Add relu to infini_operators.h
Ziminli Oct 24, 2024
abd5fef
Add Conv CPU and CUDA implementation
Ziminli Sep 23, 2024
850af57
Use dt mapping for cuda data types, switched dtype_eq to operator=
Ziminli Sep 25, 2024
ba772da
Add fp32 support
Ziminli Oct 18, 2024
b13a2b1
Add conv to infini_operators.h
Ziminli Oct 24, 2024
fb78256
Merge branch 'add_conv' of github.com:PanZezhong1725/operators into a…
Ziminli Oct 24, 2024
83edbcf
Add ReLU CPU and CUDA implementation
Ziminli Sep 24, 2024
7d33849
Remove unused half2 operators, specialize half2 relu assignment, more…
Ziminli Sep 24, 2024
0db2bf7
Add support for fp32
Ziminli Oct 24, 2024
31b629e
Add relu to infini_operators.h
Ziminli Oct 24, 2024
7211bb9
Merge branch 'add_relu' of github.com:PanZezhong1725/operators into a…
Ziminli Oct 24, 2024
cd68694
Remove cudaDeviceSynchronize, fixed 1D issue
Ziminli Oct 25, 2024
e0790f3
Add explanation for vecN template types
Ziminli Oct 25, 2024
947de53
Remove device_id from the create descriptor interface and other misc.
Ziminli Oct 25, 2024
ee31879
fp16 and fp32 support for global avg pool (initial commit)
Ziminli Oct 28, 2024
fcdc5f5
Add global_avg_pool into infini_operators.h
Ziminli Oct 28, 2024
7e73787
fix 3d support
Oct 30, 2024
fb0ed24
fix status check
Oct 30, 2024
96d4b25
fix check status
Oct 30, 2024
7eaac75
fix check_ret
Oct 30, 2024
a3630ff
delete unused comment
Oct 30, 2024
da1b385
CI: time each script
PanZezhong1725 Oct 30, 2024
ac87179
Merge pull request #83 from PanZezhong1725/ascend-matmul
PanZezhong1725 Oct 30, 2024
f412a7a
CI: print in minutes and time
PanZezhong1725 Oct 30, 2024
4578b13
Merge pull request #91 from PanZezhong1725/ci_time
PanZezhong1725 Oct 30, 2024
6e0084a
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Oct 30, 2024
531784d
fix check_ret
Oct 30, 2024
cc87e26
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Oct 30, 2024
83acb0f
Add Expand operator
Ziminli Oct 31, 2024
65724c1
Add fp32 support for matmul, move getDstIndex to common utils
Ziminli Oct 31, 2024
ab31840
Add GEMM operator
Ziminli Oct 31, 2024
6a24230
Merge branch 'dev' into add_gemm
Ziminli Oct 31, 2024
3203a83
Allow Expand opeartor to handle noncontiguous data
Ziminli Oct 31, 2024
1df069c
Merge branch 'add_gemm' of github.com:PanZezhong1725/operators into a…
Ziminli Oct 31, 2024
0e93753
Add 3D GEMM test case
Ziminli Oct 31, 2024
d7365b5
Remove cudaDeviceSynchronize() in matmul.cu
Ziminli Nov 1, 2024
851154d
ascend-rope
Nov 1, 2024
9f76840
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 1, 2024
347559f
add comment
Nov 1, 2024
60e0574
Merge pull request #84 from PanZezhong1725/ascend-rearrange
PanZezhong1725 Nov 1, 2024
a4954e4
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 1, 2024
6458016
fix format
Nov 1, 2024
d15df15
fix format
Nov 1, 2024
6e06b31
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 1, 2024
2912482
Merge pull request #78 from PanZezhong1725/bangRMS
PanZezhong1725 Nov 1, 2024
1ae05c1
delete handle in Descriptor
Nov 1, 2024
ff7b8a0
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 1, 2024
8a8a5a2
mv aclnnGetworkspace to createDescriptor
Nov 1, 2024
b727971
fix bug
Nov 1, 2024
ea8dbea
Change util function names
Ziminli Nov 1, 2024
a1ba6a5
mv aclnnGetWorkpace to aclnnCreateMatmulDescriptor
Nov 4, 2024
c9151a4
mv getworkspaceSize to aclnnCreateDescriptor
Nov 4, 2024
753fcf7
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 4, 2024
085c01f
Merge pull request #85 from PanZezhong1725/ascend-rms_norm
PanZezhong1725 Nov 4, 2024
bfc9bd1
clean_up: delete depricated codes
PanZezhong1725 Nov 4, 2024
50c03fd
Merge pull request #98 from PanZezhong1725/clean_up
PanZezhong1725 Nov 4, 2024
ecf733f
mv aclnnGetWorkspaceSize to createOpDescriptor
Nov 4, 2024
9b32ac8
fix format and add device space free
Nov 4, 2024
a5a9143
fix handle in Descriptor
Nov 4, 2024
dcafa15
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 4, 2024
2dfcf83
Merge pull request #96 from PanZezhong1725/ascend-matmul
PanZezhong1725 Nov 4, 2024
084565b
Merge pull request #97 from PanZezhong1725/ascend-rearrange
PanZezhong1725 Nov 4, 2024
c6203d5
Merge pull request #99 from PanZezhong1725/ascend-causal-softmax
PanZezhong1725 Nov 4, 2024
1fe5d07
Make c tensor optional, change GEMM CUDA fp32 compute type, merge cud…
Ziminli Nov 4, 2024
45087e4
Enhanced algorithm selection and f16 conv op data type selection, add…
Ziminli Nov 4, 2024
c8c115a
Add cudaDeviceProp and compute capability numbers into cuda handle
Ziminli Nov 5, 2024
b1edcc1
Add cudaDeviceProp and compute capability numbers into the cuda handl…
Ziminli Nov 5, 2024
3b532a8
fix: softmax remove tensor
PanZezhong1725 Nov 5, 2024
6e81a88
Merge pull request #100 from PanZezhong1725/fix_tensor
PanZezhong1725 Nov 5, 2024
19ccf17
delete cnnl rope and swiglu
xgqdut2016 Nov 5, 2024
958aca1
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 5, 2024
6fff6c5
Add properties into cuda handle, clean the code
Ziminli Nov 5, 2024
514cc27
Add cudaDeviceProp and compute capability numbers into cuda handle
Ziminli Nov 5, 2024
183a5fd
fix mlp
kilinchange Oct 22, 2024
668c8d4
fix attention
kilinchange Nov 5, 2024
62d14b1
Change rtol, test with profiling
Ziminli Nov 5, 2024
e9f3ec2
Add omp optimization to cpu and add profiling in test
Ziminli Nov 5, 2024
fc745e0
Separate rtol for fp16 and other cases
Ziminli Nov 5, 2024
4dd989e
Fixed openmp parallelization for applyConv
Ziminli Nov 5, 2024
3a7bafb
dim<=32
Nov 5, 2024
0cb4203
Merge pull request #61 from PanZezhong1725/add_conv
PanZezhong1725 Nov 5, 2024
49ee9f2
Merge pull request #68 from PanZezhong1725/add_relu
PanZezhong1725 Nov 6, 2024
ef2e741
Merge branch 'dev' into add_gemm
Ziminli Nov 6, 2024
d629df9
delete handle_pool.h .c
xgqdut2016 Nov 6, 2024
3d892ee
Merge branch 'dev' into add_global_avg_pool
Ziminli Nov 6, 2024
8c4d1aa
fix: CpuRearrangeDescriptor
bitzyz Nov 6, 2024
4a7075f
Merge pull request #80 from PanZezhong1725/bangRoPE
PanZezhong1725 Nov 6, 2024
17ce764
Merge pull request #104 from PanZezhong1725/fix-rearrange
PanZezhong1725 Nov 6, 2024
c32d37d
fix(attn): remove new
kilinchange Nov 6, 2024
9e976bd
Add checkCudaErrorWithCode to cudaDestroyDescriptor() for add and expand
Ziminli Nov 6, 2024
82de992
Merge branch 'add_gemm' of github.com:PanZezhong1725/operators into a…
Ziminli Nov 6, 2024
2ed25c0
Add cuDNN implementation for ndim==5, add profiling to frontend test,…
Ziminli Nov 6, 2024
1e74a5f
Merge pull request #93 from PanZezhong1725/add_gemm
PanZezhong1725 Nov 6, 2024
5e26117
Merge branch 'dev' into add_global_avg_pool
Ziminli Nov 6, 2024
fdbf030
Merge pull request #89 from PanZezhong1725/add_global_avg_pool
PanZezhong1725 Nov 6, 2024
decdc98
feat: 修改华为手写算子编译流程
PanZezhong1725 Nov 12, 2024
ea143a6
fix: handle unsupported dtype
PanZezhong1725 Nov 12, 2024
516414f
Merge branch 'dev' into ascend-rope
PanZezhong1725 Nov 12, 2024
813208e
Merge pull request #101 from PanZezhong1725/fix_mlp
PanZezhong1725 Nov 12, 2024
4e7b278
fix: 删除重复代码,修复merge错误
PanZezhong1725 Nov 13, 2024
ae7791e
Merge pull request #90 from PanZezhong1725/ascend-rope
PanZezhong1725 Nov 13, 2024
ffeb920
refactor: 添加install项目功能
PanZezhong1725 Nov 13, 2024
d61b282
fix ascend matmul
Nov 13, 2024
0d08a44
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Nov 13, 2024
81e7370
fix CI
PanZezhong1725 Nov 13, 2024
f31da48
Merge pull request #107 from PanZezhong1725/install
PanZezhong1725 Nov 13, 2024
9bed572
fix bug
Nov 13, 2024
20da14c
fix bug
Nov 13, 2024
3505123
Merge pull request #108 from PanZezhong1725/ascend-matmul-fix
PanZezhong1725 Nov 13, 2024
283aa3e
fix matmul test
Nov 14, 2024
b0f9f62
Merge pull request #109 from PanZezhong1725/fix-matmul-test
PanZezhong1725 Nov 14, 2024
d0dd9a3
fix(matmul): fix cpu matmul
kilinchange Nov 15, 2024
84faf9e
fix: 华为matmul支持batch
PanZezhong1725 Nov 18, 2024
5f20d3f
Merge pull request #112 from PanZezhong1725/fix_cpu_matmul
PanZezhong1725 Nov 18, 2024
352b9d8
fix: use setDescriptor in rms norm
PanZezhong1725 Nov 18, 2024
2c9c4b6
fix: add prerun for pytorch matmul
PanZezhong1725 Nov 18, 2024
1cb9c15
Merge pull request #113 from PanZezhong1725/ascend_gemm
zhangyue207 Nov 18, 2024
f7f2462
fix: 修改infer storage shape
PanZezhong1725 Nov 19, 2024
33ace8a
fix: 考虑最后一维不连续,以及stride是0的情况
PanZezhong1725 Nov 19, 2024
c245e8d
Add Pooling (CUDA)
Ziminli Nov 4, 2024
9afc5ee
Separate avg pool and max pool and completed CPU implementation
Ziminli Nov 4, 2024
bf1d5ca
Add CUDA support for 4D-8D input
Ziminli Nov 5, 2024
c6b0293
Moved common utility functions for conv and pooling to common_cpu
Ziminli Nov 5, 2024
7a879b7
Add Pooling (CUDA)
Ziminli Nov 4, 2024
96dee4a
Separate avg pool and max pool and completed CPU implementation
Ziminli Nov 4, 2024
9b7ef24
Add CUDA support for 4D-8D input
Ziminli Nov 5, 2024
727c3e3
Remove pooling bang
Ziminli Nov 5, 2024
c4f4b3e
Add profiling in tests, add max_pool and avg_pool into infini_operato…
Ziminli Nov 6, 2024
9593cb1
Mark Cambricon sections to TODO
Ziminli Nov 6, 2024
1e477e4
Changed pooling signature, moved pooling.h to src/ops/pooling
Ziminli Nov 6, 2024
7528578
Fix merge issues
Ziminli Nov 6, 2024
3bc18a2
Merge pull request #114 from PanZezhong1725/ascend_set_tensor
zhangyue207 Nov 20, 2024
2527514
fix: 昇腾infer storage shape按1D数组
PanZezhong1725 Nov 20, 2024
6ba333e
Merge pull request #117 from PanZezhong1725/ascend_storage_shape
zhangyue207 Nov 20, 2024
4248519
hack ascend random sample as argmax
Nov 21, 2024
e6947e3
fix format
Nov 21, 2024
b28909e
recover single test
Nov 21, 2024
66e54c8
Merge pull request #118 from PanZezhong1725/hack-ascend-random-sample
PanZezhong1725 Nov 21, 2024
4754117
test: 增加llama中的rearrange实例
PanZezhong1725 Nov 21, 2024
7de54c4
Add delete desc, remove copyF32DataToF16
Ziminli Nov 19, 2024
7fe149a
Separate avg pool and max pool and completed CPU implementation
Ziminli Nov 4, 2024
0003b69
Changed pooling signature, moved pooling.h to src/ops/pooling
Ziminli Nov 5, 2024
4ed0629
Fix merge issues
Ziminli Nov 6, 2024
aee71ee
Remove new in max pool and avg pool
Ziminli Nov 20, 2024
a7707a0
Merge pull request #95 from PanZezhong1725/add_pooling
PanZezhong1725 Nov 22, 2024
094fdf4
temp: 将rearrange准备工作全部移至计算内(等华为setTensorAddr修复)
PanZezhong1725 Nov 25, 2024
f1eb144
Merge pull request #121 from PanZezhong1725/ascend_rearrange_tmp
PanZezhong1725 Nov 25, 2024
aa5c06d
Enhance error-handling macros to display error line numbers and relat…
kilinchange Nov 22, 2024
d9effae
modified according to the reviewer's comments
kilinchange Nov 26, 2024
eb3546b
Merge pull request #120 from PanZezhong1725/fix_error_handling
PanZezhong1725 Nov 28, 2024
74a52b6
fix: 修改xmake build和install的提示信息
PanZezhong1725 Dec 4, 2024
765cc56
Merge pull request #125 from PanZezhong1725/xmake_fix
PanZezhong1725 Dec 4, 2024
5ca7ccd
add ascend random sample
Dec 4, 2024
360dee3
fix random_sample test
Dec 4, 2024
8fbcfe7
modify xmake.lua and main.yaml
kilinchange Dec 4, 2024
70aaae1
update README
kilinchange Dec 4, 2024
09e3b0b
fix bug
Dec 4, 2024
588f6e3
remove os.setenv
kilinchange Dec 4, 2024
92e2d06
Merge pull request #126 from PanZezhong1725/update_installation_process
PanZezhong1725 Dec 5, 2024
2ef577c
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Dec 5, 2024
2c30a13
delete rms cnnl
xgqdut2016 Dec 5, 2024
0ab53f1
Merge pull request #128 from PanZezhong1725/rms_delete
PanZezhong1725 Dec 5, 2024
171418f
fix ub overflow
Dec 5, 2024
f33f6a2
fix: fix for windows
YdrMaster Dec 6, 2024
c15ce20
rm old randomsample
Dec 9, 2024
65111d9
Merge branch 'dev' of github.com:PanZezhong1725/operators into ascend…
Dec 9, 2024
3d11a03
delete old infer storage shape code
Dec 9, 2024
a6e5e2b
add infiniOp DT to aclDataType transfer func
Dec 9, 2024
a30ca26
Merge pull request #130 from YdrMaster/main
PanZezhong1725 Dec 9, 2024
43e65e8
fix randomsample new defination
Dec 9, 2024
c1d21a7
modifed utils warning
xgqdut2016 Dec 10, 2024
fa8bbeb
fix: 更改seed的边界值处理
PanZezhong1725 Dec 10, 2024
275ace2
modified utils.h
xgqdut2016 Dec 10, 2024
55ac2b6
Merge pull request #127 from PanZezhong1725/ascend-random-sample
PanZezhong1725 Dec 10, 2024
1838557
Merge pull request #131 from PanZezhong1725/utils_warn
PanZezhong1725 Dec 10, 2024
16abf98
Change dynamic stack array to std::vector
Ziminli Dec 10, 2024
380f65c
fix: fix every error for cuda on windows
YdrMaster Dec 11, 2024
5a67281
Merge pull request #132 from YdrMaster/main
PanZezhong1725 Dec 16, 2024
ff54fb1
fix: 修复cuda代码中的类型错误
PanZezhong1725 Dec 16, 2024
745a4b8
Merge pull request #133 from PanZezhong1725/fix_cuda_
PanZezhong1725 Dec 16, 2024
9f6b19d
fix(cpu): 为 rearrange 支持 ndim == 1
YdrMaster Dec 16, 2024
f28604e
Merge pull request #134 from YdrMaster/main
PanZezhong1725 Dec 17, 2024
0200c75
fix(nv): 为 rearrange 支持 ndim == 1
YdrMaster Dec 17, 2024
a03df4e
fix: 所有 unsigned long int 替换为 uint64_t
YdrMaster Dec 17, 2024
d44beb0
fix(nv): 改正 rearrange
YdrMaster Dec 17, 2024
9214a89
Merge pull request #135 from YdrMaster/main
PanZezhong1725 Dec 18, 2024
f43df84
fix: random sample测试使用确定的分布
PanZezhong1725 Dec 18, 2024
f74eca1
Merge pull request #136 from PanZezhong1725/random_sample_test
PanZezhong1725 Dec 18, 2024
4ed33fe
fix: add set device id for cuda rope ang swiglu
PanZezhong1725 Dec 20, 2024
b58fe18
Merge pull request #137 from PanZezhong1725/rope_set_device
PanZezhong1725 Dec 20, 2024
588ebff
fix:解决寒武纪不支持64计算的问题
xgqdut2016 Jan 8, 2025
6c66f4b
fix: 寒武纪rearrange支持1维张量
PanZezhong1725 Jan 9, 2025
a72d562
Merge pull request #141 from PanZezhong1725/fix_bang_rearrange
PanZezhong1725 Jan 10, 2025
2bfa8cd
Fix deep copy info issues for add_cpu, conv_cpu, and pooling_cpu
Ziminli Jan 10, 2025
accf4ee
Merge pull request #142 from PanZezhong1725/fix_op_info_deep_copy
PanZezhong1725 Jan 10, 2025
0d65a49
add invalidate func for TensorDescriptor in python
kilinchange Jan 13, 2025
3ff397e
fix random_sample: torch uniformly generates int64 type data
kilinchange Jan 13, 2025
155fba1
Merge pull request #143 from PanZezhong1725/fix_test
PanZezhong1725 Jan 13, 2025
0e7b816
feat: 统一测试框架的profile流程
PanZezhong1725 Jan 13, 2025
7bd1aa2
Merge pull request #145 from PanZezhong1725/add_profile
PanZezhong1725 Jan 14, 2025
452edbd
fix: 寒武纪调库算子增加同步
PanZezhong1725 Jan 15, 2025
7062ef4
Merge pull request #146 from PanZezhong1725/fix_cnnl
PanZezhong1725 Jan 20, 2025
bf08475
success debug bang causal softmax
xgqdut2016 Jan 20, 2025
dc10e20
add code introduction
xgqdut2016 Jan 20, 2025
d7bda2a
Merge pull request #148 from PanZezhong1725/debug_causal_softmax
PanZezhong1725 Jan 20, 2025
0353a33
fix: mlu random sample
xgqdut2016 Feb 5, 2025
859e446
Device增加沐曦(HC)
qinyiqun Dec 24, 2024
e9e35f9
沐曦添加 matmul
qinyiqun Dec 24, 2024
a39f2b8
沐曦增加 swiglu
qinyiqun Dec 24, 2024
3b0b1ab
沐曦增加 rope
qinyiqun Dec 24, 2024
15eedcf
沐曦增加 rms_norm
qinyiqun Dec 24, 2024
fbf5d76
沐曦增加 rearrange
qinyiqun Dec 24, 2024
d9dd301
沐曦增加 random_sample
qinyiqun Dec 24, 2024
762678e
沐曦增加 causal softmax
qinyiqun Dec 24, 2024
9f58b82
添加DEVICE枚举信息
qinyiqun Jan 10, 2025
7cf84bf
fix maca
qinyiqun Feb 6, 2025
2bd55da
Merge pull request #152 from PanZezhong1725/maca
qinyiqun Feb 10, 2025
48aad8b
曙光:支持DCU推理
PanZezhong1725 Feb 10, 2025
02970cb
fix: cublas matmul fp16使用f32计算方式
PanZezhong1725 Feb 10, 2025
93db3ad
Merge pull request #151 from PanZezhong1725/sugon_dcu
PanZezhong1725 Feb 10, 2025
f736704
Device 增加摩尔线程
qinyiqun Nov 29, 2024
49ee1e3
添加摩尔线程 MatMul 算子
qinyiqun Nov 29, 2024
b14587b
添加摩尔线程 Causal_softmax 算子
qinyiqun Nov 29, 2024
6e84da6
添加摩尔线程 rearrange 算子
qinyiqun Nov 29, 2024
c5bc281
添加摩尔线程 rms_norm 算子
qinyiqun Nov 29, 2024
8bd132f
摩尔线程添加 Rope 算子
qinyiqun Nov 29, 2024
4522473
摩尔线程添加 swiglu 算子
qinyiqun Nov 29, 2024
329ca21
添加摩尔线程 random sample 算子
qinyiqun Nov 29, 2024
19565fb
摩尔:setdevice之前进行判断
qinyiqun Nov 29, 2024
37c4f54
摩尔线程添加 Add 算子
qinyiqun Dec 5, 2024
251ca48
摩尔线程添加 expand 算子
qinyiqun Dec 5, 2024
1c142a8
摩尔线程添加 relu 算子
qinyiqun Dec 5, 2024
862e4f2
增加对mudnn的支持
qinyiqun Jan 9, 2025
bac08e9
rebase dev
qinyiqun Feb 7, 2025
c9ade4d
fix format and rebase dev
qinyiqun Feb 10, 2025
a560438
Merge pull request #153 from qinyiqun/musa
PanZezhong1725 Feb 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
name: CI

on:
push:
branches:
- main
- dev
pull_request:


jobs:
build:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Install Python
uses: actions/setup-python@v4
with:
python-version: '3.x'

- name: Install Python dependencies
run: |
pip install numpy
pip install torch

- name: Install xmake
uses: xmake-io/github-action-setup-xmake@v1
with:
xmake-version: latest

- name: configure xmake
run: xmake f --cpu=true -cv

- name: Set INFINI_ROOT
run: |
export INFINI_ROOT=$GITHUB_WORKSPACE/.infini
mkdir -p $INFINI_ROOT
echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV

- name: Build with XMake
run: xmake build && xmake install

- name: Run Python Tests
run: |
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # No Color

PASSED_TESTS=()
FAILED_TESTS=()
for script in operatorspy/tests/*.py; do
if [ "$(basename $script)" != "__init__.py" ] && [ "$(basename $script)" != "test_utils.py" ]; then
echo "Running $script"
START_TIME=$(date +%s)
if ! python3 $script --cpu; then
echo "$script failed"
FAILED_TESTS+=($script)
else
echo "$script passed"
PASSED_TESTS+=($script)
fi
END_TIME=$(date +%s)
DURATION=$(( END_TIME - START_TIME ))
MINUTES=$(( DURATION / 60 ))
SECONDS=$(( DURATION % 60 ))
echo "Execution time for $script: ${MINUTES}m ${SECONDS}s"
fi
done

if [ ${#FAILED_TESTS[@]} -ne 0 ]; then
echo "The following tests passed:"
for test in "${PASSED_TESTS[@]}"; do
echo -e "${GREEN}$test${NC}"
done
echo "The following tests failed:"
for test in "${FAILED_TESTS[@]}"; do
echo -e "${RED}$test${NC}"
done
exit 1
else
echo "The following tests passed:"
for test in "${PASSED_TESTS[@]}"; do
echo -e "${GREEN}$test${NC}"
done
echo "${GREEN}All tests passed${NC}"
fi
env:
INFINI_ROOT: ${{ env.INFINI_ROOT }}
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,13 @@ __pycache__/

# Lib
lib/
out/

# Log
*.log

# Cache
cache/

# Json
*.json
91 changes: 72 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,77 @@
# 算子库
# InfiniOperators 算子库

跨平台高性能通用算子库。形式为 C 接口动态库。
跨平台高性能统一算子库。形式为 C 接口动态库。

采用二段式算子设计,每个算子都实现并对外暴露以下的 C 接口:
## 简介

- 第一阶段:构造算子 Descriptor。用户提供的算子名称、硬件、以及算子配置(如计算的数据类型、计算排布等),相应模组会被 load 到硬件上。
### 算子接口设计

采用3+1段式算子设计,每个算子都实现并对外暴露以下的 C 接口:

- 第一阶段:构造硬件控柄(Handle)。用户提供控柄地址、硬件类型以及硬件序号。控柄所在的内存空间由用户管理。

```C
void* createOpDescriptor(Device, void *config);
infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, int device, int device_id);
```

- 第二阶段:计算。根据一阶段的 Descriptor,执行相应计算,用户需要提供输入输出张量,以及硬件计算流(CPU 为 NULL)
- 第二阶段:构造算子描述(Descriptor)。用户提供描述符地址、硬件控柄、以及算子涉及的张量描述(含张量数据类型、形状和步长)。这一步会完成算子所需的与张量数据无关的预计算

```C
void op(void *descriptor, Tensor output, Tensor input, void *stream);
infiniopStatus_t infiniopCreateOpDescriptor(infiniopHandle_t handle, infiniopOpDescriptor_t *desc_ptr, infiniopTensorDescriptor_t t, ...);
```

- 销毁 Descriptor
- 第三阶段(可选):计算额外工作空间。根据算子描述,计算算子所需的额外工作空间大小,并存储于用户提供的位置。具体空间分配由用户负责

```C
void destroyOpDescriptor(void *descriptor);
infiniopStatus_t infiniopGetOpWorkspaceSize(infiniopOpDescriptor_t desc, uint64_t *size);
```

- 第四阶段:计算。根据算子描述符,在指定的硬件上执行相应计算,用户需要提供输入输出的数据,以及硬件计算流(CPU 为 NULL)。

```C
infiniopStatus_t infiniopGetOp(infiniopOpDescriptor_t desc, [void *workspace, uint64_t workspace_size,] void *output_data, void *input_data, ..., void *stream);
```

- 销毁描述和硬件控柄。

```C
infiniopStatus_t infiniopDestroyOpDescriptor(infiniopOpDescriptor_t desc);
infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
```

### 张量(Tensor)描述设计

张量描述由以下几个部分组成:

1.数据类型,由打包大小(即一个元素代表几个数据)、符号位、元素大小、尾数位数、指数位数共4字节表示。定义如下:

```C
typedef struct DataLayout {
unsigned short
packed : 8,
sign : 1,
size : 7,
mantissa : 8,
exponent : 8;
} DataLayout;
```

2.维度信息。张量有多少个维度。类型为uint64_t。

3.张量形状。张量每个维度的大小。类型为uint64_t*。

4.张量步长。张量每个维度的步长。类型为uint64_t*。

创建和销毁张量描述符的接口:

```C
infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, DataLayout layout, uint64_t ndim, uint64_t *shape, uint64_t *strides);
infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
```

## 一、使用说明

### 配置
### 1. 配置

#### 查看当前配置

Expand Down Expand Up @@ -52,23 +99,27 @@ xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
xmake f --cambricon-mlu=true -cv
```

### 编译
#### 配置 NPU

```xmake
xmake f --ascend-npu=true -cv
```

### 2. 编译安装

```xmake
xmake
xmake build && xmake install
```

### 将编译好的算子库添加至环境变量 `INFINI_ROOT`
### 3. 设置环境变量

```bash
export INFINI_ROOT=[PATH_TO_LIBRARY]
```
按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。

### 运行算子测试
### 4. 运行算子测试

```bash
cd operatorspy/tests
python operator_name.py
python operator_name.py [--cpu | --cuda | --cambricon | --ascend]
```

## 二、开发说明
Expand All @@ -82,6 +133,8 @@ python operator_name.py
│   │   ├── [operator_name].h # 对外暴露的算子 C 接口定义,descriptor 定义
│   ├── tensor
│   │   ├── tensor_descriptor.h # 对外暴露的张量 descriptor 定义
│   ├── handle
│   │   ├── handle_export.h # 对外暴露的硬件 handle 定义
│   ├── *.h # 对外暴露的核心结构体定义
├── src
│   ├── devices
Expand All @@ -105,7 +158,7 @@ python operator_name.py

- 在 `src/device.h` 和 `operatorspy/devices.py` 中增加新的硬件类型,注意两者需要一一对应;
- 在 `xmake.lua` 中增加新硬件的编译选项以及编译方式;
- 在 `src/ops/devices/[device_name]` 下编写特定硬件的通用代码
- 在 `src/ops/devices/[device_name]` 下编写特定硬件的handle实现和通用代码
- 实现该硬件的算子;

### 增加新的算子
Expand Down
20 changes: 20 additions & 0 deletions include/data_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,28 @@ typedef struct DataLayout {
size : 7,
mantissa : 8,
exponent : 8;

#ifdef __cplusplus
    // Two layouts are equal iff every bit-field matches.
    //
    // Field-wise comparison replaces the previous union type-pun
    // (write DataLayout member, read unsigned int member): reading an
    // inactive union member is undefined behavior in standard C++,
    // even though most compilers tolerate it. The field-wise form is
    // well-defined and also immune to indeterminate padding bits in
    // the object representation.
    bool operator==(const DataLayout &other) const {
        return packed == other.packed &&
               sign == other.sign &&
               size == other.size &&
               mantissa == other.mantissa &&
               exponent == other.exponent;
    }

    bool operator!=(const DataLayout &other) const {
        return !(*this == other);
    }
#endif
} DataLayout;

typedef struct DataLayout DT;

// clang-format off
const static struct DataLayout
I8 = {1, 1, 1, 7, 0},
Expand Down
11 changes: 8 additions & 3 deletions include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@
#define __DEVICE_H__

enum DeviceEnum {
DevCpu,
DevNvGpu,
DevCambriconMlu,
DevCpu = 0,
DevNvGpu = 1,
DevCambriconMlu = 2,
DevAscendNpu = 3,
DevMetaxGpu = 4,
DevMthreadsGpu = 5,
};

typedef enum DeviceEnum Device;

#endif// __DEVICE_H__
12 changes: 12 additions & 0 deletions include/handle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef INFINIOP_HANDLE_H
#define INFINIOP_HANDLE_H

#include "device.h"

typedef struct HandleStruct {
Device device;
} HandleStruct;

typedef HandleStruct *infiniopHandle_t;

#endif
12 changes: 12 additions & 0 deletions include/handle/handle_export.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef INFINIOP_HANDLE_EXPORT_H
#define INFINIOP_HANDLE_EXPORT_H
#include "../status.h"
#include "../handle.h"
#include "../export.h"
#include "../device.h"

__C __export infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id);

__C __export infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle);

#endif // INFINIOP_HANDLE_EXPORT_H
14 changes: 13 additions & 1 deletion include/infini_operators.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
#include "handle/handle_export.h"
#include "ops/add/add.h"
#include "ops/attention/attention.h"
#include "ops/avg_pool/avg_pool.h"
#include "ops/causal_softmax/causal_softmax.h"
#include "ops/global_avg_pool/global_avg_pool.h"
#include "ops/expand/expand.h"
#include "ops/gemm/gemm.h"
#include "ops/conv/conv.h"
#include "ops/matmul/matmul.h"
#include "ops/reform/reform.h"
#include "ops/max_pool/max_pool.h"
#include "ops/mlp/mlp.h"
#include "ops/random_sample/random_sample.h"
#include "ops/rearrange/rearrange.h"
#include "ops/relu/relu.h"
#include "ops/rms_norm/rms_norm.h"
#include "ops/rotary_embedding/rotary_embedding.h"
#include "ops/swiglu/swiglu.h"
Expand Down
6 changes: 2 additions & 4 deletions include/operators.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
#ifndef __OPERATORS_H__
#define __OPERATORS_H__

#include "data_type.h"
#include "device.h"
#include "tensor.h"

typedef enum DeviceEnum Device;
typedef struct DataLayout DT;
#include "handle.h"
#include "status.h"

#endif// __OPERATORS_H__
27 changes: 27 additions & 0 deletions include/ops/add/add.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef ADD_H
#define ADD_H

#include "../../export.h"
#include "../../operators.h"

typedef struct AddDescriptor {
Device device;
} AddDescriptor;

typedef AddDescriptor *infiniopAddDescriptor_t;

__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
infiniopAddDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);

__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
void *c,
void const *a,
void const *b,
void *stream);

__C __export infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc);

#endif
Loading