
Commit fbcf1ff

fix: replace all unsigned long int with uint64_t
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 15e2474 commit fbcf1ff
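
Background for the type swap: unsigned long int is 64 bits on LP64 platforms (typical Linux) but only 32 bits on LLP64 platforms such as 64-bit Windows, so sizes and strides declared with it can silently truncate, whereas uint64_t from <cstdint> is 64 bits everywhere. The following minimal sketch is not part of the commit; it only illustrates that motivation.

// Not part of the commit: shows why a fixed-width type is used for sizes.
#include <cstdint>
#include <cstdio>

int main() {
    // 4 or 8 depending on the data model (LLP64 vs. LP64)
    std::printf("sizeof(unsigned long int) = %zu\n", sizeof(unsigned long int));
    // always 8
    std::printf("sizeof(uint64_t)          = %zu\n", sizeof(uint64_t));
    return 0;
}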

19 files changed, +119 -120 lines changed

src/ops/causal_softmax/bang/causal_softmax_bang.h

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ infiniopStatus_t bangGetCausalSoftmaxWorkspaceSize(CausalSoftmaxBangDescriptor_t
 
 infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc,
                                    void *workspace,
-                                   unsigned long int workspace_size,
+                                   uint64_t workspace_size,
                                    void *data,
                                    void *stream);
 

src/ops/causal_softmax/bang/causal_softmax_bang.mlu

Lines changed: 2 additions & 2 deletions
@@ -787,7 +787,7 @@ void causal_softmax_bang_f16(CausalSoftmaxBangDescriptor_t desc, void *workspace
 
 infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc,
                                    void *workspace,
-                                   unsigned long int workspace_size,
+                                   uint64_t workspace_size,
                                    void *data,
                                    void *stream) {
     if (cnrtSetDevice(desc->device_id) != cnrtSuccess) {
@@ -798,4 +798,4 @@ infiniopStatus_t bangCausalSoftmax(CausalSoftmaxBangDescriptor_t desc,
         return STATUS_SUCCESS;
     }
     return STATUS_BAD_TENSOR_DTYPE;
-}
+}

src/ops/causal_softmax/bang/causal_softmax_cnnl.cc

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ infiniopStatus_t cnnlDestroyCausalSoftmaxDescriptor(CausalSoftmaxCnnlDescriptor_
 
 infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc,
                                    void *workspace,
-                                   unsigned long int workspace_size,
+                                   uint64_t workspace_size,
                                    void *data,
                                    void *stream) {
     if (cnrtSetDevice(desc->device_id) != cnrtSuccess) {

src/ops/causal_softmax/bang/causal_softmax_cnnl.h

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ infiniopStatus_t cnnlGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCnnlDescriptor_t
 
 infiniopStatus_t cnnlCausalSoftmax(CausalSoftmaxCnnlDescriptor_t desc,
                                    void *workspace,
-                                   unsigned long int workspace_size,
+                                   uint64_t workspace_size,
                                    void *data,
                                    void *stream);
 

src/ops/causal_softmax/cuda/causal_softmax.cc

Lines changed: 7 additions & 7 deletions
@@ -5,20 +5,20 @@
 infiniopStatus_t cudaCreateCausalSoftmaxDescriptor(CudaHandle_t handle,
                                                    CausalSoftmaxCudaDescriptor_t *desc_ptr,
                                                    infiniopTensorDescriptor_t y) {
-    unsigned long int ndim = y->ndim;
+    uint64_t ndim = y->ndim;
     // TODO: only support 2d or 3d tensor
     if (ndim != 2 && ndim != 3) {
         return STATUS_BAD_TENSOR_SHAPE;
     }
     if (!dtype_eq(y->dt, F16)) {
         return STATUS_BAD_TENSOR_DTYPE;
     }
-    unsigned long int total_seq_len = y->shape[ndim - 1];
-    unsigned long int seq_len = y->shape[ndim - 2];
-    unsigned long int batch_size = 1;
-    unsigned long int stride_b = 0;
-    unsigned long int stride_i = y->strides[ndim - 2];
-    unsigned long int stride_j = y->strides[ndim - 1];
+    uint64_t total_seq_len = y->shape[ndim - 1];
+    uint64_t seq_len = y->shape[ndim - 2];
+    uint64_t batch_size = 1;
+    uint64_t stride_b = 0;
+    uint64_t stride_i = y->strides[ndim - 2];
+    uint64_t stride_j = y->strides[ndim - 1];
     if (stride_j != 1) {
         return STATUS_BAD_TENSOR_STRIDES;
     }

src/ops/causal_softmax/cuda/causal_softmax.cu

Lines changed: 12 additions & 12 deletions
@@ -218,17 +218,17 @@ __global__ void fused_softmax_standard(
 }
 
 
-void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void* y, void *stream) {
-    unsigned long int total_seq_len = desc->total_seq_len;
-    unsigned long int seq_len = desc->seq_len;
-    unsigned long int batch_size = desc->batch_size;
-    unsigned long int stride_x = desc->stride_b;
-    unsigned long int stride_y = desc->stride_i;
-    unsigned long int stride_z = desc->stride_j;// covert byte strides to element strides
+void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void *y, void *stream) {
+    uint64_t total_seq_len = desc->total_seq_len;
+    uint64_t seq_len = desc->seq_len;
+    uint64_t batch_size = desc->batch_size;
+    uint64_t stride_x = desc->stride_b;
+    uint64_t stride_y = desc->stride_i;
+    uint64_t stride_z = desc->stride_j;// covert byte strides to element strides
     unsigned int max_items_per_thread = desc->max_items_per_thread;
 
     dim3 grid(batch_size, seq_len);
-
+
     if (max_items_per_thread == 1) {
         fused_softmax_padding<MAX_THREADS_PER_BLOCK>
             <<<grid, total_seq_len, 0, (cudaStream_t) stream>>>((half *) (y), stride_x, stride_y, stride_z);
@@ -243,13 +243,13 @@ void causal_softmax_nv_gpu_f16(CausalSoftmaxCudaDescriptor_t desc, void* y, void
 
 infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc,
                                    void *workspace,
-                                   unsigned long int workspace_size,
+                                   uint64_t workspace_size,
                                    void *data,
-                                   void *stream){
-    if(cudaSetDevice(desc->device_id) != cudaSuccess){
+                                   void *stream) {
+    if (cudaSetDevice(desc->device_id) != cudaSuccess) {
         return STATUS_BAD_DEVICE;
     }
-    if (dtype_eq(desc->dtype, F16)){
+    if (dtype_eq(desc->dtype, F16)) {
         causal_softmax_nv_gpu_f16(desc, data, stream);
         return STATUS_SUCCESS;
     }
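
One side note on the uint64_t declarations above: dim3 stores its dimensions as unsigned int, so batch_size and seq_len are narrowed when the grid is built. The range check below is purely hypothetical (not something this commit adds) and only illustrates the narrowing concern.

// Hypothetical range check, not from the commit: a uint64_t dimension larger
// than UINT_MAX would be silently truncated when stored into a dim3 field.
#include <cstdint>
#include <limits>

static inline bool fits_grid_dim(uint64_t v) {
    return v <= std::numeric_limits<unsigned int>::max();
}
// e.g. if (!fits_grid_dim(batch_size) || !fits_grid_dim(seq_len)) return STATUS_BAD_TENSOR_SHAPE;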

src/ops/causal_softmax/cuda/causal_softmax.cuh

Lines changed: 7 additions & 7 deletions
@@ -8,12 +8,12 @@ struct CausalSoftmaxCudaDescriptor {
     Device device;
     int device_id;
     DT dtype;
-    unsigned long int batch_size;
-    unsigned long int stride_b;
-    unsigned long int seq_len;
-    unsigned long int stride_i;
-    unsigned long int total_seq_len;
-    unsigned long int stride_j;
+    uint64_t batch_size;
+    uint64_t stride_b;
+    uint64_t seq_len;
+    uint64_t stride_i;
+    uint64_t total_seq_len;
+    uint64_t stride_j;
     unsigned int max_items_per_thread;
 };
 
@@ -27,7 +27,7 @@ infiniopStatus_t cudaGetCausalSoftmaxWorkspaceSize(CausalSoftmaxCudaDescriptor_t
 
 infiniopStatus_t cudaCausalSoftmax(CausalSoftmaxCudaDescriptor_t desc,
                                    void *workspace,
-                                   unsigned long int workspace_size,
+                                   uint64_t workspace_size,
                                    void *data,
                                    void *stream);
 
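
With this header, a caller now passes the workspace size as uint64_t end to end. The sketch below is hypothetical (not from the commit): the out-parameter form of cudaGetCausalSoftmaxWorkspaceSize and the helper name run_causal_softmax are assumptions, and the error handling is illustrative only.

// Hypothetical caller-side sketch; the signature of
// cudaGetCausalSoftmaxWorkspaceSize used here is an assumption.
#include <cstdint>
#include <cuda_runtime.h>

infiniopStatus_t run_causal_softmax(CausalSoftmaxCudaDescriptor_t desc,
                                    void *y_data, void *stream) {
    uint64_t workspace_size = 0;
    infiniopStatus_t st = cudaGetCausalSoftmaxWorkspaceSize(desc, &workspace_size);
    if (st != STATUS_SUCCESS) return st;

    void *workspace = nullptr;
    if (cudaMalloc(&workspace, workspace_size) != cudaSuccess) return STATUS_BAD_DEVICE;

    st = cudaCausalSoftmax(desc, workspace, workspace_size, y_data, stream);
    cudaFree(workspace);
    return st;
}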

src/ops/matmul/ascend/matmul_aclnn.cc

Lines changed: 10 additions & 11 deletions
@@ -69,13 +69,12 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
     // aclnnGemm support C = alpha * A @ B + beta * C
     // see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
     ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc,
-                                   (*desc_ptr)->mt, &workspaceSize, &executor);
+                                    (*desc_ptr)->mt, &workspaceSize, &executor);
     CHECK_RET(ret == ACL_SUCCESS,
-             LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
-             return STATUS_EXECUTION_FAILED);
+              LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
+              return STATUS_EXECUTION_FAILED);
     aclSetAclOpExecutorRepeatable(executor);
 
-
     return STATUS_SUCCESS;
 }
 
@@ -109,14 +108,14 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
     aclrtSetDevice(desc->device_id);
 
     for (int i = 0; i < batch; i++) {
-        AclSetTensorAddr(executor, 0, ta, (char *)(a) + i * desc->info->a_matrix.stride * desc->dtype.size);
-        AclSetTensorAddr(executor, 1, tb, (char *)(b) + i * desc->info->b_matrix.stride * desc->dtype.size);
-        AclSetTensorAddr(executor, 2, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size);
-        AclSetTensorAddr(executor, 3, tc, (char *)(c) + i * desc->info->c_matrix.stride * desc->dtype.size);
+        AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size);
+        AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size);
+        AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
+        AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
         aclnnStatus ret = aclnnGemm(workspace,
-                                   workspaceSize,
-                                   executor,
-                                   stream);
+                                    workspaceSize,
+                                    executor,
+                                    stream);
         CHECK_RET(ret == ACL_SUCCESS,
                   LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
                   return STATUS_EXECUTION_FAILED);
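
The loop above advances each batch's base address by stride * element size in bytes. The helper below is hypothetical (not part of the commit) and only restates that arithmetic; doing it in uint64_t keeps large offsets from overflowing 32-bit math.

// Hypothetical helper, not from the commit: per-batch byte offset as used in
// the AclSetTensorAddr loop. Strides are element counts, so the byte offset
// is batch_index * stride * element_size, computed in 64-bit arithmetic.
#include <cstdint>

static inline void *batch_ptr(void *base, uint64_t batch_index,
                              uint64_t stride_elems, uint64_t elem_size) {
    return (char *) base + batch_index * stride_elems * elem_size;
}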

src/ops/random_sample/bang/random_sample_bang.h

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ infiniopStatus_t bangGetRandomSampleWorkspaceSize(RandomSampleBangDescriptor_t d
 
 infiniopStatus_t bangRandomSample(RandomSampleBangDescriptor_t desc,
                                   void *workspace,
-                                  unsigned long int workspace_size,
+                                  uint64_t workspace_size,
                                   void *result,
                                   void const *probs,
                                   float random_val,
