reducing pass pipeline (now smaller and faster) (#21)

fschlimb · tkarna · web-flow · commit eafba260ae6d · 2023-05-22T18:45:36.000+02:00
* reducing pass pipeline (now smaller and faster)
* add tests for elementwise power and bitwise and

---------

Co-authored-by: Tuomas Karna <tuomas.karna@intel.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -131,5 +131,9 @@ jobs:
           export DDPT_CRUNNER_SO="$GITHUB_WORKSPACE"/third_party/install/llvm-mlir/lib/libmlir_c_runner_utils.so
           pytest test
           DDPT_FORCE_DIST=1 pytest test
+      - name: Cleanup
+        run: |
+          pip list
+          pip uninstall -y ddptensor
       - run: |
           echo "This job's status is ${{ job.status }}."
diff --git a/src/idtr.cpp b/src/idtr.cpp
@@ -280,12 +280,9 @@ using MRIdx1d = Unranked1DMemRefType<int64_t>;
 template <typename T>
 void _idtr_reduce_all(int64_t dataRank, void *dataDescr, int op) {
   UnrankedMemRefType<T> data(dataRank, dataDescr);
-  auto inout = data.data();
-  auto sizes = data.sizes();
-  auto strides = data.strides();
-  assert(dataRank == 0 || (dataRank == 1 && strides[0] == 1));
+  assert(dataRank == 0 || (dataRank == 1 && data.strides()[0] == 1));
   getTransceiver()->reduce_all(
-      inout, DTYPE<T>::value, dataRank ? sizes[0] : 1,
+      data.data(), DTYPE<T>::value, dataRank ? data.sizes()[0] : 1,
       mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
 }
 
diff --git a/src/include/ddptensor/MemRefType.hpp b/src/include/ddptensor/MemRefType.hpp
@@ -20,11 +20,14 @@ template <typename T> class UnrankedMemRefType {
   UnrankedMemRefType(int64_t rank, void *p)
       : _rank(rank), _descriptor(reinterpret_cast<intptr_t *>(p)){};
 
-  T *data() { return reinterpret_cast<T *>(_descriptor[1]); };
+  T *data() { return &reinterpret_cast<T *>(_descriptor[1])[_descriptor[2]]; };
   int64_t rank() const { return _rank; }
-  int64_t *sizes() { return reinterpret_cast<int64_t *>(&_descriptor[3]); };
+  int64_t *sizes() {
+    return _rank ? reinterpret_cast<int64_t *>(&_descriptor[3]) : nullptr;
+  };
   int64_t *strides() {
-    return reinterpret_cast<int64_t *>(&_descriptor[3 + _rank]);
+    return _rank ? reinterpret_cast<int64_t *>(&_descriptor[3 + _rank])
+                 : nullptr;
   };
 };
 
diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp
@@ -405,11 +405,11 @@ static const char *pass_pipeline =
                             "func.func(tosa-to-tensor),"
                             "canonicalize,"
                             "linalg-fuse-elementwise-ops,"
-                            "convert-shape-to-std,"
+                            // "convert-shape-to-std,"
                             "arith-expand,"
                             "memref-expand,"
                             "arith-bufferize,"
-                            "func-bufferize,"
+                            // "func-bufferize,"
                             "func.func(empty-tensor-to-alloc-tensor),"
                             "func.func(scf-bufferize),"
                             "func.func(tensor-bufferize),"
diff --git a/test/test_ewb.py b/test/test_ewb.py
@@ -68,3 +68,21 @@ def test_prod_het(self):
         r = dt.sum(c, [0, 1])
         v = 16 * 16 * 2 * 2
         assert float(r) == v
+
+    def test_pow(self):
+        for dtyp in [dt.int64, dt.float64]:
+            a = dt.full((6, 6), 3, dtype=dtyp)
+            b = dt.full((6, 6), 2, dtype=dtyp)
+            c = a**b
+            r1 = dt.sum(c, [0, 1])
+            v = 6 * 6 * 9
+            assert float(r1) == v
+
+    def test_bitwise_and(self):
+        for dtyp in mpi_idtypes:
+            a = dt.full((6, 6), 3, dtype=dtyp)
+            b = dt.full((6, 6), 2, dtype=dtyp)
+            c = a & b
+            r1 = dt.sum(c, [0, 1])
+            v = 6 * 6 * 2
+            assert float(r1) == v