adding (but not using) inplace binop support

fschlimb · fschlimb · commit 24aac84dee71 · 2023-03-07T08:36:51.000-06:00
diff --git a/ddptensor/ddptensor.py b/ddptensor/ddptensor.py
@@ -24,15 +24,12 @@ def __repr__(self):
                 f"{method} = lambda self, other: dtensor(_cdt.EWBinOp.op(_cdt.{METHOD}, self._t, other._t if isinstance(other, dtensor) else other))"
             )
 
-    def _inplace(self, t):
-        self._t = t
-        return self
-
-    for method in api.api_categories["IEWBinOp"]:
-        METHOD = method.upper()
-        exec(
-            f"{method} = lambda self, other: self._inplace(_cdt.IEWBinOp.op(_cdt.{METHOD}, self._t, other._t if isinstance(other, dtensor) else other))"
-        )
+    # in-place operators still lead to an assignment; this needs more involved analysis
+    # for method in api.api_categories["IEWBinOp"]:
+    #     METHOD = method.upper()
+    #     exec(
+    #         f"{method} = lambda self, other: (self, _cdt.IEWBinOp.op(_cdt.{METHOD}, self._t, other._t if isinstance(other, dtensor) else other)[0])"
+    #     )
 
     for method in api.api_categories["EWUnyOp"]:
         if method.startswith("__"):
diff --git a/src/EWBinOp.cpp b/src/EWBinOp.cpp
@@ -5,7 +5,6 @@
 */
 
 #include "ddptensor/EWBinOp.hpp"
-#include "ddptensor/CollComm.hpp"
 #include "ddptensor/Creator.hpp"
 #include "ddptensor/DDPTensorImpl.hpp"
 #include "ddptensor/Factory.hpp"
@@ -449,14 +448,6 @@ struct DeferredEWBinOp : public Deferred {
       : Deferred(a.dtype(), std::max(a.rank(), b.rank()), true), _a(a.id()),
         _b(b.id()), _op(op) {}
 
-  void run() override {
-#if 0
-        const auto a = std::move(Registry::get(_a).get());
-        const auto b = std::move(Registry::get(_b).get());
-        set_value(std::move(TypeDispatch<x::EWBinOp>(a, b, _op)));
-#endif
-  }
-
   bool generate_mlir(::mlir::OpBuilder &builder, ::mlir::Location loc,
                      jit::DepManager &dm) override {
     // FIXME the type of the result is based on a only
diff --git a/src/IEWBinOp.cpp b/src/IEWBinOp.cpp
@@ -8,80 +8,47 @@
 #include "ddptensor/Creator.hpp"
 #include "ddptensor/DDPTensorImpl.hpp"
 #include "ddptensor/Factory.hpp"
+#include "ddptensor/Registry.hpp"
 #include "ddptensor/TypeDispatch.hpp"
 
-#if 0
-namespace x {
+#include <imex/Dialect/Dist/IR/DistOps.h>
+#include <imex/Dialect/PTensor/IR/PTensorOps.h>
+#include <mlir/Dialect/Shape/IR/Shape.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/BuiltinTypeInterfaces.h>
 
-    class IEWBinOp
-    {
-    public:
-        using ptr_type = DPTensorBaseX::ptr_type;
-
-        template<typename A, typename B>
-        static ptr_type op(IEWBinOpId iop, std::shared_ptr<DPTensorX<A>> a_ptr, const std::shared_ptr<DPTensorX<B>> & b_ptr)
-        {
-            auto & ax = a_ptr->xarray();
-            const auto & bx = b_ptr->xarray();
-            if(a_ptr->is_sliced() || b_ptr->is_sliced()) {
-                auto av = xt::strided_view(ax, a_ptr->lslice());
-                const auto & bv = xt::strided_view(bx, b_ptr->lslice());
-                return do_op(iop, av, bv, a_ptr);
-            }
-            return do_op(iop, ax, bx, a_ptr);
-        }
-
-#pragma GCC diagnostic ignored "-Wswitch"
-        template<typename A, typename T1, typename T2>
-        static ptr_type do_op(IEWBinOpId iop, T1 & a, const T2 & b, std::shared_ptr<DPTensorX<A>> a_ptr)
-        {
-            switch(iop) {
-            case __IADD__:
-                a += b;
-                return a_ptr;
-            case __IFLOORDIV__:
-                a = xt::floor(a / b);
-                return a_ptr;
-            case __IMUL__:
-                a *= b;
-                return a_ptr;
-            case __ISUB__:
-                a -= b;
-                return a_ptr;
-            case __ITRUEDIV__:
-                a /= b;
-                return a_ptr;
-            case __IPOW__:
-                throw std::runtime_error("Binary inplace operation not implemented");
-            }
-            if constexpr (std::is_integral<typename T1::value_type>::value && std::is_integral<typename T2::value_type>::value) {
-                switch(iop) {
-                case __IMOD__:
-                    a %= b;
-                    return a_ptr;
-                case __IOR__:
-                    a |= b;
-                    return a_ptr;
-                case __IAND__:
-                    a &= b;
-                    return a_ptr;
-                case __IXOR__:
-                    a ^= b;
-                case __ILSHIFT__:
-                    a = xt::left_shift(a, b);
-                    return a_ptr;
-                case __IRSHIFT__:
-                    a = xt::right_shift(a, b);
-                    return a_ptr;
-                }
-            }
-            throw std::runtime_error("Unknown/invalid inplace elementwise binary operation");
-        }
-#pragma GCC diagnostic pop
-
-    };
-} // namespace x
-#endif // if 0
+// convert id of our binop to id of imex::ptensor binop
+static ::imex::ptensor::EWBinOpId ddpt2mlir(const IEWBinOpId bop) {
+  switch (bop) {
+  case __IADD__:
+    return ::imex::ptensor::ADD;
+  case __IAND__:
+    return ::imex::ptensor::BITWISE_AND;
+  case __IFLOORDIV__:
+    return ::imex::ptensor::FLOOR_DIVIDE;
+  case __ILSHIFT__:
+    return ::imex::ptensor::BITWISE_LEFT_SHIFT;
+  case __IMOD__:
+    return ::imex::ptensor::MODULO;
+  case __IMUL__:
+    return ::imex::ptensor::MULTIPLY;
+  case __IOR__:
+    return ::imex::ptensor::BITWISE_OR;
+  case __IPOW__:
+    return ::imex::ptensor::POWER;
+  case __IRSHIFT__:
+    return ::imex::ptensor::BITWISE_RIGHT_SHIFT;
+  case __ISUB__:
+    return ::imex::ptensor::SUBTRACT;
+  case __ITRUEDIV__:
+    return ::imex::ptensor::TRUE_DIVIDE;
+  case __IXOR__:
+    return ::imex::ptensor::BITWISE_XOR;
+  default:
+    throw std::runtime_error(
+        "Unknown/invalid inplace elementwise binary operation");
+  }
+}
 
 struct DeferredIEWBinOp : public Deferred {
   id_type _a;
@@ -91,15 +58,45 @@ struct DeferredIEWBinOp : public Deferred {
   DeferredIEWBinOp() = default;
   DeferredIEWBinOp(IEWBinOpId op, const tensor_i::future_type &a,
                    const tensor_i::future_type &b)
-      : _a(a.id()), _b(b.id()), _op(op) {}
+      : Deferred(a.dtype(), a.rank(), a.balanced()), _a(a.id()), _b(b.id()),
+        _op(op) {}
+
+  bool generate_mlir(::mlir::OpBuilder &builder, ::mlir::Location loc,
+                     jit::DepManager &dm) override {
+    // FIXME the type of the result is based on operand `a` only
+    auto av = dm.getDependent(builder, _a);
+    auto bv = dm.getDependent(builder, _b);
 
-  void run() {
-    // const auto a = std::move(Registry::get(_a).get());
-    // const auto b = std::move(Registry::get(_b).get());
-    // set_value(std::move(TypeDispatch<x::IEWBinOp>(a, b, _op)));
+    auto aTyp = ::imex::dist::getPTensorType(av);
+    ::mlir::SmallVector<int64_t> shape(rank(), ::mlir::ShapedType::kDynamic);
+    auto outTyp =
+        ::imex::ptensor::PTensorType::get(shape, aTyp.getElementType());
+
+    auto binop = builder.create<::imex::ptensor::EWBinOp>(
+        loc, outTyp, builder.getI32IntegerAttr(ddpt2mlir(_op)), av, bv);
+    // insertsliceop has no return value, so we just create the op...
+    auto zero = ::imex::createIndex(loc, builder, 0);
+    auto one = ::imex::createIndex(loc, builder, 1);
+    auto dyn = ::imex::createIndex(loc, builder, ::mlir::ShapedType::kDynamic);
+    ::mlir::SmallVector<::mlir::Value> offs(rank(), zero);
+    ::mlir::SmallVector<::mlir::Value> szs(rank(), dyn);
+    ::mlir::SmallVector<::mlir::Value> strds(rank(), one);
+    (void)builder.create<::imex::ptensor::InsertSliceOp>(loc, av, binop, offs,
+                                                         szs, strds);
+    // ... and use av later to create the ptensor
+    dm.addVal(this->guid(), av,
+              [this](Transceiver *transceiver, uint64_t rank, void *allocated,
+                     void *aligned, intptr_t offset, const intptr_t *sizes,
+                     const intptr_t *strides, uint64_t *gs_allocated,
+                     uint64_t *gs_aligned, uint64_t *lo_allocated,
+                     uint64_t *lo_aligned, uint64_t balanced) {
+                this->set_value(Registry::get(this->_a).get());
+              });
+    return false;
   }
 
   FactoryId factory() const { return F_IEWBINOP; }
+
   template <typename S> void serialize(S &ser) {
     ser.template value<sizeof(_a)>(_a);
     ser.template value<sizeof(_b)>(_b);
diff --git a/src/SetGetItem.cpp b/src/SetGetItem.cpp
@@ -204,12 +204,6 @@ struct DeferredSetItem : public Deferred {
                   const std::vector<py::slice> &v)
       : _a(a.id()), _b(b.id()), _slc(v) {}
 
-  void run() {
-    // const auto a = std::move(Registry::get(_a).get());
-    // const auto b = std::move(Registry::get(_b).get());
-    // set_value(std::move(TypeDispatch<x::SetItem>(a, b, _slc, _b)));
-  }
-
   bool generate_mlir(::mlir::OpBuilder &builder, ::mlir::Location loc,
                      jit::DepManager &dm) override {
     // get params and extract offsets/sizes/strides
@@ -229,9 +223,9 @@ struct DeferredSetItem : public Deferred {
       sizesV[i] = ::imex::createIndex(loc, builder, sizes[i]);
       stridesV[i] = ::imex::createIndex(loc, builder, strides[i]);
     }
-    // insertsliceop has no return value, so we just craete the op...
-    builder.create<::imex::ptensor::InsertSliceOp>(loc, av, bv, offsV, sizesV,
-                                                   stridesV);
+    // insertsliceop has no return value, so we just create the op...
+    (void)builder.create<::imex::ptensor::InsertSliceOp>(loc, av, bv, offsV,
+                                                         sizesV, stridesV);
     // ... and use av as to later create the ptensor
     dm.addVal(this->guid(), av,
               [this](Transceiver *transceiver, uint64_t rank, void *allocated,
diff --git a/src/include/ddptensor/Deferred.hpp b/src/include/ddptensor/Deferred.hpp
@@ -23,7 +23,10 @@ struct Runable {
   using ptr_type = std::unique_ptr<Runable>;
   virtual ~Runable(){};
   /// actually execute, a deferred will set value of future
-  virtual void run() = 0;
+  virtual void run() {
+    throw(std::runtime_error(
+        "No immediate execution support for this operation."));
+  };
   /// generate MLIR code for jit
   /// the runable might not generate MLIR and instead return true
   /// to request the scheduler to execute the run method instead.
diff --git a/test/stencil-2d.py b/test/stencil-2d.py
@@ -144,7 +144,6 @@ def main():
     for i in range(n):
         for j in range(n):
             A[i, j] = float(i + j)
-    print(A.dtype)
     B = numpy.zeros((n, n), dtype=numpy.float64)
 
     for k in range(iterations + 1):
@@ -154,9 +153,8 @@ def main():
 
         if pattern == "star":
             if r == 2:
-                B[2 : n - 2, 2 : n - 2] = (
-                    B[2 : n - 2, 2 : n - 2]
-                    + W[2, 2] * A[2 : n - 2, 2 : n - 2]
+                B[2 : n - 2, 2 : n - 2] += (
+                    W[2, 2] * A[2 : n - 2, 2 : n - 2]
                     + W[2, 0] * A[2 : n - 2, 0 : n - 4]
                     + W[2, 1] * A[2 : n - 2, 1 : n - 3]
                     + W[2, 3] * A[2 : n - 2, 3 : n - 1]
@@ -168,11 +166,10 @@ def main():
                 )
             else:
                 b = n - r
-                B[r:b, r:b] = B[r:b, r:b] + W[r, r] * A[r:b, r:b]
+                B[r:b, r:b] += W[r, r] * A[r:b, r:b]
                 for s in range(1, r + 1):
-                    B[r:b, r:b] = (
-                        B[r:b, r:b]
-                        + W[r, r - s] * A[r:b, r - s : b - s]
+                    B[r:b, r:b] += (
+                        W[r, r - s] * A[r:b, r - s : b - s]
                         + W[r, r + s] * A[r:b, r + s : b + s]
                         + W[r - s, r] * A[r - s : b - s, r:b]
                         + W[r + s, r] * A[r + s : b + s, r:b]
@@ -182,11 +179,7 @@ def main():
                 b = n - r
                 for s in range(-r, r + 1):
                     for t in range(-r, r + 1):
-                        B[r:b, r:b] = (
-                            B[r:b, r:b]
-                            + W[r + t, r + s] * A[r + t : b + t, r + s : b + s]
-                        )
-
+                        B[r:b, r:b] += W[r + t, r + s] * A[r + t : b + t, r + s : b + s]
         A = A + 1.0
 
     t1 = timer()
@@ -196,7 +189,9 @@ def main():
     # * Analyze and output results.
     # ******************************************************************************
 
-    print(W, B)
+    print(W)
+    print("********************************")
+    print(B)
     # norm = numpy.linalg.norm(numpy.reshape(B,n*n),ord=1)
     # active_points = (n-2*r)**2
     # norm /= active_points