adjust to new imex; support getitem resulting in a 0d tensor; let idtr work outside of ddpt

fschlimb · fschlimb · commit 217e862fcb65 · 2023-03-01T09:37:45.000-06:00
diff --git a/ddptensor/__init__.py b/ddptensor/__init__.py
@@ -50,7 +50,7 @@ def to_numpy(a):
     if not op.startswith("__"):
         OP = op.upper()
         exec(
-            f"{op} = lambda this, other: dtensor(_cdt.EWBinOp.op(_cdt.{OP}, this._t, other._t if isinstance(other, ddptensor) else other))"
+            f"{op} = lambda this, other: dtensor(_cdt.EWBinOp.op(_cdt.{OP}, this._t if isinstance(this, ddptensor) else this, other._t if isinstance(other, ddptensor) else other))"
         )
 
 for op in api.api_categories["EWUnyOp"]:
diff --git a/ddptensor/ddptensor.py b/ddptensor/ddptensor.py
@@ -51,7 +51,11 @@ def _inplace(self, t):
         )
 
     def __getitem__(self, key):
-        return dtensor(self._t.__getitem__(key if isinstance(key, tuple) else (key,)))
+        key = key if isinstance(key, tuple) else (key,)
+        key = [x if isinstance(x, slice) else slice(x, x+1, 1) for x in key]
+        return dtensor(self._t.__getitem__(key))
 
     def __setitem__(self, key, value):
-         self._t.__setitem__(key if isinstance(key, tuple) else (key,), value._t) # if isinstance(value, dtensor) else value)
+        key = key if isinstance(key, tuple) else (key,)
+        key = [x if isinstance(x, slice) else slice(x, x+1, 1) for x in key]
+        self._t.__setitem__(key, value._t) # if isinstance(value, dtensor) else value)
diff --git a/src/Creator.cpp b/src/Creator.cpp
@@ -241,14 +241,14 @@ ddptensor * Creator::arange(uint64_t start, uint64_t end, uint64_t step, DTypeId
     return new ddptensor(defer<DeferredArange>(start, end, step, dtype, team));
 }
 
-ddptensor * Creator::mk_future(const py::object & b)
+std::pair<ddptensor *, bool> Creator::mk_future(const py::object & b)
 {
     if(py::isinstance<ddptensor>(b)) {
-        return b.cast<ddptensor*>();
+        return {b.cast<ddptensor*>(), false};
     } else if(py::isinstance<py::float_>(b)) {
-        return Creator::full({}, b, FLOAT64);
+        return {Creator::full({}, b, FLOAT64), true};
     } else if(py::isinstance<py::int_>(b)) {
-        return Creator::full({}, b, INT64);
+        return {Creator::full({}, b, INT64), true};
     }
     throw std::runtime_error("Invalid right operand to elementwise binary operation");
 };
diff --git a/src/Deferred.cpp b/src/Deferred.cpp
@@ -107,7 +107,7 @@ void process_promises()
             // create return statement and adjust function type
             uint64_t osz = dm.handleResult(builder);
             // also request generation of c-wrapper function
-            function->setAttr(::mlir::LLVM::LLVMDialect::getEmitCWrapperAttrName(), ::mlir::UnitAttr::get(&jit._context));
+            function->setAttr(::mlir::LLVM::LLVMDialect::getEmitCWrapperAttrName(), builder.getUnitAttr());
             function.getFunctionType().dump(); std::cout << std::endl;
             // add the function to the module
             module.push_back(function);
diff --git a/src/EWBinOp.cpp b/src/EWBinOp.cpp
@@ -444,7 +444,7 @@ struct DeferredEWBinOp : public Deferred
 
     DeferredEWBinOp() = default;
     DeferredEWBinOp(EWBinOpId op, const tensor_i::future_type & a, const tensor_i::future_type & b)
-        : Deferred(a.dtype(), a.rank(), true),
+        : Deferred(a.dtype(), std::max(a.rank(), b.rank()), true),
           _a(a.id()), _b(b.id()), _op(op)
     {}
 
@@ -462,12 +462,13 @@ struct DeferredEWBinOp : public Deferred
         // FIXME the type of the result is based on a only
         auto av = dm.getDependent(builder, _a);
         auto bv = dm.getDependent(builder, _b);
-
-        auto aPtTyp = ::imex::dist::getPTensorType(av);
-        assert(aPtTyp);
+        
+        auto aTyp = ::imex::dist::getPTensorType(av);
+        ::mlir::SmallVector<int64_t> shape(rank(), ::mlir::ShapedType::kDynamic);
+        auto outTyp = ::imex::ptensor::PTensorType::get(shape, aTyp.getElementType());
 
         dm.addVal(this->guid(),
-                  builder.create<::imex::ptensor::EWBinOp>(loc, aPtTyp, builder.getI32IntegerAttr(ddpt2mlir(_op)), av, bv),
+                  builder.create<::imex::ptensor::EWBinOp>(loc, outTyp, builder.getI32IntegerAttr(ddpt2mlir(_op)), av, bv),
                   [this](Transceiver * transceiver, uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides,
                          uint64_t * gs_allocated, uint64_t * gs_aligned, uint64_t * lo_allocated, uint64_t * lo_aligned, uint64_t balanced) {
             this->set_value(std::move(mk_tnsr(transceiver, _dtype, rank, allocated, aligned, offset, sizes, strides,
@@ -490,13 +491,17 @@ struct DeferredEWBinOp : public Deferred
     }
 };
 
-ddptensor * EWBinOp::op(EWBinOpId op, const ddptensor & a, const py::object & b)
+ddptensor * EWBinOp::op(EWBinOpId op, const py::object & a, const py::object & b)
 {
     auto bb = Creator::mk_future(b);
+    auto aa = Creator::mk_future(a);
     if(op == __MATMUL__) {
-        return LinAlgOp::vecdot(a, *bb, 0);
+        return LinAlgOp::vecdot(*aa.first, *bb.first, 0);
     }
-    return new ddptensor(defer<DeferredEWBinOp>(op, a.get(), bb->get()));
+    auto res = new ddptensor(defer<DeferredEWBinOp>(op, aa.first->get(), bb.first->get()));
+    if(aa.second) delete aa.first;
+    if(bb.second) delete bb.first;
+    return res;    
 }
 
 FACTORY_INIT(DeferredEWBinOp, F_EWBINOP);
diff --git a/src/IEWBinOp.cpp b/src/IEWBinOp.cpp
@@ -112,7 +112,9 @@ struct DeferredIEWBinOp : public Deferred
 ddptensor * IEWBinOp::op(IEWBinOpId op, ddptensor & a, const py::object & b)
 {
     auto bb = Creator::mk_future(b);
-    return new ddptensor(defer<DeferredIEWBinOp>(op, a.get(), bb->get()));
+    auto res = new ddptensor(defer<DeferredIEWBinOp>(op, a.get(), bb.first->get()));
+    if(bb.second) delete bb.first;
+    return res;
 }
 
 FACTORY_INIT(DeferredIEWBinOp, F_IEWBINOP);
diff --git a/src/ReduceOp.cpp b/src/ReduceOp.cpp
@@ -126,7 +126,7 @@ struct DeferredReduceOp : public Deferred
         assert(aPtTyp);
         ::mlir::Type dtype = aPtTyp.getElementType();
         // return type 0d with same dtype as input
-        auto retPtTyp = ::imex::ptensor::PTensorType::get(builder.getContext(), 0, dtype, false);
+        auto retPtTyp = ::imex::ptensor::PTensorType::get({::mlir::ShapedType::kDynamic}, dtype);
         // reduction op
         auto mop = ddpt2mlir(_op);
         auto op = builder.getIntegerAttr(builder.getIntegerType(sizeof(mop)*8), mop);
diff --git a/src/SetGetItem.cpp b/src/SetGetItem.cpp
@@ -280,22 +280,28 @@ struct DeferredGetItem : public Deferred
         auto & strides = _slc.strides();
         auto nd = offs.size();
         // convert C++ slices into vectors of MLIR Values
-        std::vector<::mlir::Value> offsV(nd);
-        std::vector<::mlir::Value> sizesV(nd);
-        std::vector<::mlir::Value> stridesV(nd);
+        std::vector<::mlir::OpFoldResult> offsV(nd);
+        std::vector<::mlir::OpFoldResult> sizesV(nd);
+        std::vector<::mlir::OpFoldResult> stridesV(nd);
+        ::mlir::SmallVector<int64_t> shape(nd, ::mlir::ShapedType::kDynamic);
         for(auto i = 0; i<nd; ++i) {
             offsV[i] = ::imex::createIndex(loc, builder, offs[i]);
-            sizesV[i] = ::imex::createIndex(loc, builder, sizes[i]);
             stridesV[i] = ::imex::createIndex(loc, builder, strides[i]);
+            if(sizes[i] == 1) {
+                sizesV[i] = builder.getIndexAttr(sizes[i]);
+                shape[i] = sizes[i];
+            } else {
+                sizesV[i] = ::imex::createIndex(loc, builder, sizes[i]);
+            }
         }
+
+        auto oTyp = ::imex::dist::getPTensorType(av);
+        // auto outnd = nd == 0 || _slc.size() == 1 ? 0 : nd;
+        auto outTyp = ::imex::ptensor::PTensorType::get(shape, oTyp.getElementType());
         // now we can create the PTensor op using the above Values
+        auto res = builder.create<::imex::ptensor::SubviewOp>(loc, outTyp, av, offsV, sizesV, stridesV);
         dm.addVal(this->guid(),
-                  builder.create<::imex::ptensor::ExtractSliceOp>(loc,
-                      ::imex::dist::getPTensorType(av),
-                      av,
-                      offsV,
-                      sizesV,
-                      stridesV),
+                  res,
                   [this, dtype](Transceiver * transceiver, uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides,
                                 uint64_t * gs_allocated, uint64_t * gs_aligned, uint64_t * lo_allocated, uint64_t * lo_aligned, uint64_t balanced) {
             this->set_value(std::move(mk_tnsr(transceiver, dtype, rank, allocated, aligned, offset, sizes, strides,
diff --git a/src/idtr.cpp b/src/idtr.cpp
@@ -31,17 +31,33 @@ T * mr_to_ptr(void * ptr, intptr_t offset)
 
 extern "C" {
 
+#define NO_TRANSCEIVER
+#ifdef NO_TRANSCEIVER
+static void initMPIRuntime() {
+    if(getTransceiver() == nullptr)
+      init_transceiver(new MPITransceiver(false));
+}
+#endif
+
 // Return number of ranks/processes in given team/communicator
-uint64_t idtr_nprocs(int64_t team)
+uint64_t idtr_nprocs(Transceiver * tc)
 {
-    return getTransceiver()->nranks();
+#ifdef NO_TRANSCEIVER
+    initMPIRuntime();
+    tc = getTransceiver();
+#endif
+    return tc->nranks();
 }
 #pragma weak _idtr_nprocs = idtr_nprocs
 
 // Return rank in given team/communicator
-uint64_t idtr_prank(int64_t team)
+uint64_t idtr_prank(Transceiver * tc)
 {
-    return getTransceiver()->rank();
+#ifdef NO_TRANSCEIVER
+    initMPIRuntime();
+    tc = getTransceiver();
+#endif
+    return tc->rank();
 }
 #pragma weak _idtr_prank = idtr_prank
 
@@ -173,7 +189,9 @@ void forall(uint64_t d, const T * cptr, const int64_t * sizes, const int64_t * s
         }
     } else {
         for(auto i=0; i<sz; ++i) {
+            const T * tmp = cptr;
             forall(d+1, cptr, sizes, strides, nd, op);
+            cptr = tmp + strides[d];
         }
     }
 }
@@ -190,20 +208,26 @@ bool is_contiguous(const int64_t * sizes, const int64_t * strides, uint64_t nd)
     return true;
 }
 
-void * bufferize(void * cptr, DTypeId dtype, const int64_t * sizes, const int64_t * strides, uint64_t nd, void * out)
-{
-    if(is_contiguous(sizes, strides, nd)) {
-        return cptr;
-    } else {
-        dispatch(dtype, cptr, [sizes, strides, nd, out](auto * ptr) {
-            auto buff = static_cast<decltype(ptr)>(out);
-            forall(0, ptr, sizes, strides, nd, [&buff](const auto * in) {
-                *buff = *in;
-                ++buff;
-            });
-        });
-        return out;
-    }
+void bufferize(void * cptr, DTypeId dtype, const int64_t * sizes, const int64_t * strides, const int64_t * tStarts, const int64_t * tSizes, uint64_t nd, uint64_t N, void * out)
+{    
+    dispatch(dtype, cptr, [sizes, strides, tStarts, tSizes, nd, N, out](auto * ptr) {
+        auto buff = static_cast<decltype(ptr)>(out);
+        
+        for(auto i=0; i<N; ++i) {
+            auto szs = &tSizes[i*nd];
+            if(szs[0] > 0) {
+                auto sts = &tStarts[i*nd];
+                uint64_t off = 0;
+                for(int64_t r=0; r<nd; ++r) {
+                    off += sts[r] * strides[r];
+                }
+                forall(0, &ptr[off], szs, strides, nd, [&buff](const auto * in) {
+                    *buff = *in;
+                    ++buff;
+                });
+            }
+        }
+    });
 }
 
 extern "C" {
@@ -223,6 +247,7 @@ void _idtr_reduce_all(uint64_t rank, void * data, const int64_t * sizes, const i
                     mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
 }
 
+#if 0
 void _idtr_rebalance(uint64_t rank, const int64_t * gShape, const int64_t * lOffs,
                      void * data, const int64_t * sizes, const int64_t * strides, int dtype,
                      uint64_t outRank, void * out, const int64_t * outSizes, const int64_t * outStrides)
@@ -269,7 +294,7 @@ void _idtr_rebalance(uint64_t rank, const int64_t * gShape, const int64_t * lOff
     // Finally communicate elements
     getTransceiver()->alltoall(ptr, sszs.data(), soffs.data(), ddpttype, out, rszs.data(), roffs.data());
 }
-
+#endif
 
 /// @brief repartition tensor
 /// We assume tensor is partitioned along the first dimension (only) and partitions are ordered by ranks
@@ -288,18 +313,20 @@ void _idtr_repartition(int64_t rank, int64_t * gShapePtr, int dtype,
                        void * lDataPtr, int64_t * lOffsPtr, int64_t * lShapePtr, int64_t * lStridesPtr,
                        int64_t * offsPtr, int64_t * szsPtr, void * outPtr, Transceiver * tc)
 {
-    assert(is_contiguous(lShapePtr, lStridesPtr, rank));
-
+#ifdef NO_TRANSCEIVER
+    initMPIRuntime();
+    tc = getTransceiver();
+#endif
     auto N = tc->nranks();
     auto me = tc->rank();
     auto ddpttype = mlir2ddpt(static_cast<::imex::ptensor::DType>(dtype));
-    auto nSz = std::accumulate(&lShapePtr[1], &lShapePtr[rank], 1, std::multiplies<int64_t>());
 
     // First we allgather the requested target partitioning
 
     auto myBOff = 2 * rank * me;
     ::std::vector<int64_t> buff(2*rank*N);
     for(int64_t i=0; i<rank; ++i) {
+        // assert(offsPtr[i] - lOffsPtr[i] + szsPtr[i] <= gShapePtr[i]);
         buff[myBOff+i] = offsPtr[i];
         buff[myBOff+i+rank] = szsPtr[i];
     }
@@ -315,24 +342,44 @@ void _idtr_repartition(int64_t rank, int64_t * gShapePtr, int dtype,
     auto myOff = lOffsPtr[0];
     auto mySz = lShapePtr[0];
     auto myEnd = myOff + mySz;
+    auto myTileSz = std::accumulate(&lShapePtr[1], &lShapePtr[rank], 1, std::multiplies<int64_t>());
 
     std::vector<int> soffs(N);
     std::vector<int> sszs(N, 0);
+    std::vector<int64_t> tStarts(N*rank, 0);
+    std::vector<int64_t> tSizes(N*rank, 0);
+    std::vector<int64_t> nSizes(N);
+    int64_t totSSz = 0;
+    bool needsBufferize = !is_contiguous(lShapePtr, lStridesPtr, rank);
 
     for(auto i=0; i<N; ++i) {
+        nSizes[i] = std::accumulate(&buff[2*rank*i+rank+1], &buff[2*rank*i+rank+rank], 1, std::multiplies<int64_t>());
+        if(nSizes[i] != myTileSz) needsBufferize = true;
+    }
+    for(auto i=0; i<N; ++i) {
+        auto nSz = nSizes[i];
         auto tOff = buff[2*rank*i];
         auto tSz = buff[2*rank*i+rank];
         auto tEnd = tOff + tSz;
+
         if(tEnd > myOff && tOff < myEnd) {
             // We have a target partition which is inside my local data
             // we now compute what data goes to this target partition
             auto start = std::max(myOff, tOff);
             auto end = std::min(myEnd, tEnd);
-            soffs[i] = (int)(start - myOff) * nSz;
+            tStarts[i*rank] = start - myOff;
+            tSizes[i*rank] = end - start;
+            soffs[i] = needsBufferize ? (i ? soffs[i-1] + sszs[i-1] : 0) : (int)(start - myOff) * myTileSz;
             sszs[i] = (int)(end - start) * nSz;
         } else {
             soffs[i] = i ? soffs[i-1] + sszs[i-1] : 0;
         }
+        totSSz += sszs[i];
+        for(auto r=1; r<rank; ++r) {
+            tStarts[i*rank+r] = buff[2*rank*i+r];
+            tSizes[i*rank+r] = buff[2*rank*i+rank+r];
+            // assert(tSizes[i*rank+r] <= lShapePtr[r]);
+        }
     }
     
     // send our send sizes to others and receive theirs
@@ -348,7 +395,15 @@ void _idtr_repartition(int64_t rank, int64_t * gShapePtr, int dtype,
     }
 
     // Finally communicate elements
-    getTransceiver()->alltoall(lDataPtr, sszs.data(), soffs.data(), ddpttype, outPtr, rszs.data(), roffs.data());
+    if(needsBufferize) {
+        // create send buffer if strided
+        Buffer buff(totSSz * sizeof_dtype(ddpttype), 2);
+        bufferize(lDataPtr, ddpttype, lShapePtr, lStridesPtr, tStarts.data(), tSizes.data(), rank, N, buff.data());
+        getTransceiver()->alltoall(buff.data(), sszs.data(), soffs.data(), ddpttype, outPtr, rszs.data(), roffs.data());
+        std::cerr << "yey\n";
+    } else {
+        getTransceiver()->alltoall(lDataPtr, sszs.data(), soffs.data(), ddpttype, outPtr, rszs.data(), roffs.data());
+    }
 }
 
 void _idtr_extractslice(int64_t * slcOffs,
@@ -360,13 +415,13 @@ void _idtr_extractslice(int64_t * slcOffs,
                             int64_t * lSlcSizes,
                             int64_t * gSlcOffsets)
 {
-    std::cerr << "slcOffs: " << slcOffs[0] << " " << slcOffs[1] << std::endl;
-    std::cerr << "slcSizes: " << slcSizes[0] << " " << slcSizes[1] << std::endl;
-    std::cerr << "slcStrides: " << slcStrides[0] << " " << slcStrides[1] << std::endl;
-    std::cerr << "tOffs: " << tOffs[0] << " " << tOffs[1] << std::endl;
-    std::cerr << "tSizes: " << tSizes[0] << " " << tSizes[1] << std::endl;
-    std::cerr << "lSlcOffsets: " << lSlcOffsets[0] << " " << lSlcOffsets[1] << std::endl;
-    std::cerr << "lSlcSizes: " << lSlcSizes[0] << " " << lSlcSizes[1] << std::endl;
-    std::cerr << "gSlcOffsets: " << gSlcOffsets[0] << " " << gSlcOffsets[1] << std::endl;
+    if(slcOffs) std::cerr << "slcOffs: " << slcOffs[0] << " " << slcOffs[1] << std::endl;
+    if(slcSizes) std::cerr << "slcSizes: " << slcSizes[0] << " " << slcSizes[1] << std::endl;
+    if(slcStrides) std::cerr << "slcStrides: " << slcStrides[0] << " " << slcStrides[1] << std::endl;
+    if(tOffs) std::cerr << "tOffs: " << tOffs[0] << " " << tOffs[1] << std::endl;
+    if(tSizes) std::cerr << "tSizes: " << tSizes[0] << " " << tSizes[1] << std::endl;
+    if(lSlcOffsets) std::cerr << "lSlcOffsets: " << lSlcOffsets[0] << " " << lSlcOffsets[1] << std::endl;
+    if(lSlcSizes) std::cerr << "lSlcSizes: " << lSlcSizes[0] << " " << lSlcSizes[1] << std::endl;
+    if(gSlcOffsets) std::cerr << "gSlcOffsets: " << gSlcOffsets[0] << " " << gSlcOffsets[1] << std::endl;
 }
 } // extern "C"
diff --git a/src/include/ddptensor/Creator.hpp b/src/include/ddptensor/Creator.hpp
@@ -11,5 +11,5 @@ struct Creator
     static ddptensor * create_from_shape(CreatorId op, const shape_type & shape, DTypeId dtype=FLOAT64);
     static ddptensor * full(const shape_type & shape, const py::object & val, DTypeId dtype=FLOAT64);
     static ddptensor * arange(uint64_t start, uint64_t end, uint64_t step, DTypeId dtype=INT64, uint64_t team=0);
-    static ddptensor * mk_future(const py::object & b);
+    static std::pair<ddptensor *, bool> mk_future(const py::object & b);
 };
diff --git a/src/include/ddptensor/EWBinOp.hpp b/src/include/ddptensor/EWBinOp.hpp
@@ -8,5 +8,5 @@
 
 struct EWBinOp
 {
-    static ddptensor * op(EWBinOpId op, const ddptensor & a, const py::object & b);
+    static ddptensor * op(EWBinOpId op, const py::object & a, const py::object & b);
 };
diff --git a/src/include/ddptensor/NDSlice.hpp b/src/include/ddptensor/NDSlice.hpp
@@ -42,6 +42,7 @@ class NDSlice {
     const vec_t & offsets() const { return _offsets; };
     const vec_t & sizes()   const { return _sizes; };
     const vec_t & strides() const { return _strides; };
+    const uint64_t size() const { return VPROD(_sizes); };
 
     template<typename S>
     void serialize(S & ser)
diff --git a/src/include/ddptensor/idtr.hpp b/src/include/ddptensor/idtr.hpp
diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp

Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ def to_numpy(a):`
`50`	`50`	`if not op.startswith("__"):`
`51`	`51`	`OP = op.upper()`
`52`	`52`	`exec(`
`53`		`- f"{op} = lambda this, other: dtensor(_cdt.EWBinOp.op(_cdt.{OP}, this._t, other._t if isinstance(other, ddptensor) else other))"`
	`53`	`+ f"{op} = lambda this, other: dtensor(_cdt.EWBinOp.op(_cdt.{OP}, this._t if isinstance(this, ddptensor) else this, other._t if isinstance(other, ddptensor) else other))"`
`54`	`54`	`)`
`55`	`55`
`56`	`56`	`for op in api.api_categories["EWUnyOp"]:`
Original file line number	Diff line number	Diff line change
`@@ -241,14 +241,14 @@ ddptensor * Creator::arange(uint64_t start, uint64_t end, uint64_t step, DTypeId`
`241`	`241`	`return new ddptensor(defer<DeferredArange>(start, end, step, dtype, team));`
`242`	`242`	`}`
`243`	`243`
`244`		`-ddptensor * Creator::mk_future(const py::object & b)`
	`244`	`+std::pair<ddptensor *, bool> Creator::mk_future(const py::object & b)`
`245`	`245`	`{`
`246`	`246`	`if(py::isinstance<ddptensor>(b)) {`
`247`		`- return b.cast<ddptensor*>();`
	`247`	`+ return {b.cast<ddptensor*>(), false};`
`248`	`248`	`} else if(py::isinstance<py::float_>(b)) {`
`249`		`- return Creator::full({}, b, FLOAT64);`
	`249`	`+ return {Creator::full({}, b, FLOAT64), true};`
`250`	`250`	`} else if(py::isinstance<py::int_>(b)) {`
`251`		`- return Creator::full({}, b, INT64);`
	`251`	`+ return {Creator::full({}, b, INT64), true};`
`252`	`252`	`}`
`253`	`253`	`throw std::runtime_error("Invalid right operand to elementwise binary operation");`
`254`	`254`	`};`
Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,9 @@ struct DeferredIEWBinOp : public Deferred`
`112`	`112`	`ddptensor * IEWBinOp::op(IEWBinOpId op, ddptensor & a, const py::object & b)`
`113`	`113`	`{`
`114`	`114`	`auto bb = Creator::mk_future(b);`
`115`		`- return new ddptensor(defer<DeferredIEWBinOp>(op, a.get(), bb->get()));`
	`115`	`+ auto res = new ddptensor(defer<DeferredIEWBinOp>(op, a.get(), bb.first->get()));`
	`116`	`+ if(bb.second) delete bb.first;`
	`117`	`+ return res;`
`116`	`118`	`}`
`117`	`119`
`118`	`120`	`FACTORY_INIT(DeferredIEWBinOp, F_IEWBINOP);`