Skip to content

[NPU]: Added support for the dyt operator #1124

Open
TianHao324 wants to merge 2 commits into linkedin:main from
TianHao324:dyt
Open

[NPU]: Added support for the dyt operator #1124
TianHao324 wants to merge 2 commits into linkedin:main from
TianHao324:dyt

Conversation

@TianHao324
Copy link
Contributor

@TianHao324 TianHao324 commented Mar 3, 2026

Summary

  • Grid-stride loop optimization: efficient multi-row processing with automatic grid size tuning
  • Memory access optimization: column-blocked processing with configurable BLOCK_N; dynamic block size selection (1024-2048) based on tensor width

Testing Done

image
  • Hardware Type: Atlas 800I A2
  • run `make test` to ensure correctness
  • run `make checkstyle` to ensure code style
  • run `make test-convergence` to ensure convergence

@TianHao324
Copy link
Contributor Author

benchmark:

**************************************
     BENCHMARKING SPEED for DYT_BETA=FALSE
**************************************
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
********** Benchmark Data **********
[
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.305620014667511,
      0.6067999601364136,
      0.9137799739837646,
      0.6053199768066406,
      0.9159200191497803
    ],
    "y_values_20": [
      0.2649439871311188,
      0.5286800265312195,
      0.7921199798583984,
      0.5303199887275696,
      0.7931720018386841
    ],
    "y_values_80": [
      0.332256019115448,
      0.662880003452301,
      0.9953759908676147,
      0.6664599776268005,
      0.9975600242614746
    ],
    "timestamp": "2026-03-03 08:43:04",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.11131999641656876,
      0.21417999267578125,
      0.32378000020980835,
      0.5362799763679504,
      0.7951200008392334
    ],
    "y_values_20": [
      0.11079999804496765,
      0.21367999911308289,
      0.3230000138282776,
      0.5350800156593323,
      0.7939599752426147
    ],
    "y_values_80": [
      0.11181999742984772,
      0.2147960066795349,
      0.3245599865913391,
      0.5374400019645691,
      0.796392023563385
    ],
    "timestamp": "2026-03-03 08:43:06",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.5220199823379517,
      0.5443999767303467,
      0.5901000499725342,
      0.6064800024032593,
      0.7080000042915344
    ],
    "y_values_20": [
      0.516759991645813,
      0.5379079580307007,
      0.5850800275802612,
      0.5987600088119507,
      0.7066400051116943
    ],
    "y_values_80": [
      0.5304200053215027,
      0.5547320246696472,
      0.5978599786758423,
      0.6152759790420532,
      0.7096999883651733
    ],
    "timestamp": "2026-03-03 08:43:09",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.33070001006126404,
      0.663919985294342,
      1.3897199630737305,
      1.911620020866394,
      2.4967100620269775
    ],
    "y_values_20": [
      0.32910001277923584,
      0.6622039675712585,
      1.3871879577636719,
      1.9094719886779785,
      2.4945199489593506
    ],
    "y_values_80": [
      0.33285999298095703,
      0.6664360165596008,
      1.392199993133545,
      1.9143240451812744,
      2.499039888381958
    ],
    "timestamp": "2026-03-03 08:43:11",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.848609983921051,
      0.900920033454895,
      1.2819700241088867,
      1.1191200017929077,
      1.53233003616333
    ],
    "y_values_20": [
      0.813215970993042,
      0.8386960029602051,
      1.166051983833313,
      1.042240023612976,
      1.41895592212677
    ],
    "y_values_80": [
      0.8807560205459595,
      0.9619799852371216,
      1.3676960468292236,
      1.174720048904419,
      1.6217559576034546
    ],
    "timestamp": "2026-03-03 08:43:14",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.40946000814437866,
      0.8822699785232544,
      1.6952600479125977,
      2.422960042953491,
      3.2437000274658203
    ],
    "y_values_20": [
      0.4065279960632324,
      0.8795239925384521,
      1.693019986152649,
      2.4205400943756104,
      3.2412161827087402
    ],
    "y_values_80": [
      0.4178040027618408,
      0.8852720260620117,
      1.6983000040054321,
      2.4263200759887695,
      3.246835947036743
    ],
    "timestamp": "2026-03-03 08:43:16",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  }
]
**************************************
     BENCHMARKING MEMORY for DYT_BETA=FALSE
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      40.18505859375,
      80.361328125,
      120.537109375,
      160.71337890625,
      200.8896484375
    ],
    "y_values_20": [
      40.18505859375,
      80.361328125,
      120.537109375,
      160.71337890625,
      200.8896484375
    ],
    "y_values_80": [
      40.18505859375,
      80.361328125,
      120.537109375,
      160.71337890625,
      200.8896484375
    ],
    "timestamp": "2026-03-03 08:43:17",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=False",
    "kernel_provider": "torch",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      128.0244140625,
      256.0400390625,
      384.0556640625,
      512.0712890625,
      640.0869140625
    ],
    "y_values_20": [
      128.0244140625,
      256.0400390625,
      384.0556640625,
      512.0712890625,
      640.0869140625
    ],
    "y_values_80": [
      128.0244140625,
      256.0400390625,
      384.0556640625,
      512.0712890625,
      640.0869140625
    ],
    "timestamp": "2026-03-03 08:43:17",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": false}",
    "liger_version": "0.0.0"
  }
]
**************************************
     BENCHMARKING SPEED for DYT_BETA=TRUE
**************************************
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
[WARNING] Please DO NOT tune args ['num_warps', 'num_stages']!
********** Benchmark Data **********
[
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.3036699891090393,
      0.61080002784729,
      0.9052900075912476,
      0.6058000326156616,
      0.9066699743270874
    ],
    "y_values_20": [
      0.2655879855155945,
      0.5295000076293945,
      0.79311203956604,
      0.5323200225830078,
      0.7959439754486084
    ],
    "y_values_80": [
      0.33339202404022217,
      0.664080023765564,
      0.9963359832763672,
      0.668179988861084,
      0.9982080459594727
    ],
    "timestamp": "2026-03-03 08:43:19",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.13777999579906464,
      0.24799999594688416,
      0.4065200090408325,
      0.7241799831390381,
      1.073099970817566
    ],
    "y_values_20": [
      0.1371999979019165,
      0.24726000428199768,
      0.40501201152801514,
      0.7222800254821777,
      1.071619987487793
    ],
    "y_values_80": [
      0.13833999633789062,
      0.2487799972295761,
      0.4079599976539612,
      0.725820004940033,
      1.07451593875885
    ],
    "timestamp": "2026-03-03 08:43:22",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.5908399820327759,
      0.6938700079917908,
      0.7505500316619873,
      0.7957000136375427,
      0.78711998462677
    ],
    "y_values_20": [
      0.5816920399665833,
      0.6853479743003845,
      0.7448760271072388,
      0.7859039902687073,
      0.7793599963188171
    ],
    "y_values_80": [
      0.6561279892921448,
      0.7052800059318542,
      0.7572600245475769,
      0.8079880475997925,
      0.7981320023536682
    ],
    "timestamp": "2026-03-03 08:43:25",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.35947999358177185,
      0.6746000051498413,
      1.4288400411605835,
      1.9935200214385986,
      2.5829598903656006
    ],
    "y_values_20": [
      0.35725998878479004,
      0.672760009765625,
      1.4257999658584595,
      1.9907159805297852,
      2.58024001121521
    ],
    "y_values_80": [
      0.3616720139980316,
      0.6781399846076965,
      1.431439995765686,
      1.9970840215682983,
      2.585520029067993
    ],
    "timestamp": "2026-03-03 08:43:27",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      1.0710999965667725,
      1.0758800506591797,
      1.3090300559997559,
      1.1418800354003906,
      1.5692698955535889
    ],
    "y_values_20": [
      1.0500799417495728,
      1.0616040229797363,
      1.1925960779190063,
      1.064452052116394,
      1.4506239891052246
    ],
    "y_values_80": [
      1.0873600244522095,
      1.0903879404067993,
      1.3936160802841187,
      1.1992640495300293,
      1.652932047843933
    ],
    "timestamp": "2026-03-03 08:43:30",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      0.49211999773979187,
      0.9529600143432617,
      1.8107000589370728,
      2.6939001083374023,
      3.613649845123291
    ],
    "y_values_20": [
      0.48288798332214355,
      0.9500359892845154,
      1.8075920343399048,
      2.6895318031311035,
      3.6110799312591553
    ],
    "y_values_80": [
      0.5049120187759399,
      0.9559919834136963,
      1.813844084739685,
      2.6974880695343018,
      3.6171000003814697
    ],
    "timestamp": "2026-03-03 08:43:33",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  }
]
**************************************
     BENCHMARKING MEMORY for DYT_BETA=TRUE
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      40.3603515625,
      80.71044921875,
      121.06005859375,
      161.41015625,
      201.76025390625
    ],
    "y_values_20": [
      40.3603515625,
      80.71044921875,
      121.06005859375,
      161.41015625,
      201.76025390625
    ],
    "y_values_80": [
      40.3603515625,
      80.71044921875,
      121.06005859375,
      161.41015625,
      201.76025390625
    ],
    "timestamp": "2026-03-03 08:43:33",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "dyt_beta=True",
    "kernel_provider": "torch",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "hidden_size",
    "x_label": "hidden_size",
    "x_values": [
      1024,
      2048,
      3072,
      4096,
      5120
    ],
    "y_values_50": [
      128.0419921875,
      256.0732421875,
      384.1044921875,
      512.1357421875,
      640.1669921875
    ],
    "y_values_20": [
      128.0419921875,
      256.0732421875,
      384.1044921875,
      512.1357421875,
      640.1669921875
    ],
    "y_values_80": [
      128.0419921875,
      256.0732421875,
      384.1044921875,
      512.1357421875,
      640.1669921875
    ],
    "timestamp": "2026-03-03 08:43:33",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"BT\": 4096, \"dtype\": \"torch.bfloat16\", \"beta\": true}",
    "liger_version": "0.0.0"
  }
]

@TianHao324
Copy link
Contributor Author

@Tcc0403 would you mind taking a look at this PR?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant