Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit bb83030

Browse files
authored
introducing dt.from_local for single-process (#38)
* introducing dt.from_locals for single-process * get_local -> get_locals
1 parent 426d5b4 commit bb83030

File tree

11 files changed

+174
-26
lines changed

11 files changed

+174
-26
lines changed

ddptensor/spmd.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
from . import _ddptensor as _cdt
2+
from . import dtensor
23

34

45
def get_slice(obj, *args):
56
return _cdt._get_slice(obj._t, *args)
67

78

8-
def get_local(obj):
9-
return _cdt._get_local(obj._t, obj)
9+
def get_locals(obj):
10+
return _cdt._get_locals(obj._t, obj)
11+
12+
13+
def from_locals(objs):
14+
arg = objs if isinstance(objs, (list, tuple)) else [objs]
15+
return dtensor(_cdt._from_locals(arg))
1016

1117

1218
def gather(obj, root=_cdt._Ranks._REPLICATED):

src/DDPTensorImpl.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,33 @@ DDPTensorImpl::DDPTensorImpl(const int64_t *shape, uint64_t N, rank_type owner)
6262
assert(!_transceiver || _transceiver == getTransceiver());
6363
}
6464

65+
// from numpy
66+
DDPTensorImpl::DDPTensorImpl(DTypeId dtype, ssize_t ndims, const ssize_t *shape,
67+
const intptr_t *strides, void *data)
68+
: _owner(NOOWNER), _gShape(shape, shape + ndims),
69+
_lo_allocated(
70+
static_cast<uint64_t *>(calloc(ndims, sizeof_dtype(dtype)))),
71+
_lo_aligned(_lo_allocated),
72+
_lData(ndims, data, data, 0, reinterpret_cast<const intptr_t *>(shape),
73+
reinterpret_cast<const intptr_t *>(strides)),
74+
_dtype(dtype) {}
75+
76+
void DDPTensorImpl::set_base(const tensor_i::ptr_type &base) {
77+
_base = new SharedBaseObject<tensor_i::ptr_type>(base);
78+
}
79+
void DDPTensorImpl::set_base(BaseObj *obj) { _base = obj; }
80+
6581
DDPTensorImpl::~DDPTensorImpl() {
6682
if (!_base) {
6783
// FIXME it seems possible that halos get reallocated even when there
68-
// is a base _lhsHalo.freeData(); FIXME lhs and rhs can be identical
84+
// is a base
85+
if (_lhsHalo._allocated != _rhsHalo._allocated)
86+
_lhsHalo.freeData(); // lhs and rhs can be identical
6987
_lData.freeData();
7088
_rhsHalo.freeData();
7189
}
7290
free(_lo_allocated);
91+
delete _base;
7392
}
7493

7594
void *DDPTensorImpl::data() {
@@ -103,8 +122,9 @@ std::string DDPTensorImpl::__repr__() const {
103122
for (auto i = 0; i < nd; ++i)
104123
oss << _gShape[i] << (i == nd - 1 ? "" : ", ");
105124
oss << "), loff=(";
106-
for (auto i = 0; i < nd; ++i)
107-
oss << _lo_aligned[i] << (i == nd - 1 ? "" : ", ");
125+
if (_lo_aligned)
126+
for (auto i = 0; i < nd; ++i)
127+
oss << _lo_aligned[i] << (i == nd - 1 ? "" : ", ");
108128
oss << "), lsz=(";
109129
for (auto i = 0; i < nd; ++i)
110130
oss << _lData._sizes[i] << (i == nd - 1 ? "" : ", ");

src/IO.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,94 @@
55
*/
66

77
#include "ddptensor/IO.hpp"
8+
#include "ddptensor/DDPTensorImpl.hpp"
89
#include "ddptensor/Factory.hpp"
910
#include "ddptensor/SetGetItem.hpp"
1011
#include "ddptensor/Transceiver.hpp"
1112
#include "ddptensor/TypeDispatch.hpp"
1213

14+
#include <pybind11/numpy.h>
15+
#include <pybind11/pybind11.h>
16+
namespace py = pybind11;
17+
18+
// ***************************************************************************
19+
20+
/// @brief form a ddptensor from local numpy arrays (inplace - no copy)
21+
struct DeferredFromLocal : public Deferred {
22+
py::array _npa;
23+
24+
DeferredFromLocal() = default;
25+
DeferredFromLocal(py::array npa)
26+
: Deferred(getDTypeId(npa.dtype()),
27+
{npa.shape(), npa.shape() + npa.ndim()}, 0, true),
28+
_npa(npa) {}
29+
30+
// get our DTypeId from py::dtype
31+
DTypeId getDTypeId(const py::dtype &dtype) {
32+
auto bw = dtype.itemsize();
33+
auto kind = dtype.kind();
34+
switch (kind) {
35+
case 'i':
36+
switch (bw) {
37+
case 1:
38+
return INT8;
39+
case 2:
40+
return INT16;
41+
case 4:
42+
return INT32;
43+
case 8:
44+
return INT64;
45+
};
46+
case 'f':
47+
switch (bw) {
48+
case 4:
49+
return FLOAT32;
50+
case 8:
51+
return FLOAT64;
52+
};
53+
};
54+
throw std::runtime_error("Unsupported dtype");
55+
}
56+
57+
void run() override {
58+
auto _strides = _npa.strides();
59+
auto shape = _npa.shape();
60+
auto data = _npa.mutable_data();
61+
auto dtype = _npa.dtype();
62+
auto ndim = _npa.ndim();
63+
auto eSz = dtype.itemsize();
64+
65+
// py::array stores strides in bytes, not elements
66+
std::vector<intptr_t> strides(ndim);
67+
for (auto i = 0; i < ndim; ++i) {
68+
strides[i] = _strides[i] / eSz;
69+
}
70+
71+
auto res = mk_tnsr(getDTypeId(dtype), ndim, shape, strides.data(), data);
72+
// make sure we do not delete numpy's memory before the numpy array is dead
73+
// (notice: py::objects have ref-counting)
74+
res->set_base(new SharedBaseObject<py::object>(_npa));
75+
set_value(std::move(res));
76+
}
77+
78+
bool generate_mlir(::mlir::OpBuilder &builder, ::mlir::Location loc,
79+
jit::DepManager &dm) override {
80+
return true;
81+
}
82+
83+
FactoryId factory() const { return F_FROMLOCALS; }
84+
85+
template <typename S> void serialize(S &ser) {}
86+
};
87+
1388
GetItem::py_future_type IO::to_numpy(const ddptensor &a) {
1489
assert(!getTransceiver()->is_cw() || getTransceiver()->rank() == 0);
1590
return GetItem::gather(a, getTransceiver()->is_cw() ? 0 : REPLICATED);
1691
}
92+
93+
ddptensor *IO::from_locals(const std::vector<py::array> &a) {
94+
assert(a.size() == 1);
95+
return new ddptensor(defer<DeferredFromLocal>(a.front()));
96+
}
97+
98+
FACTORY_INIT(DeferredFromLocal, F_FROMLOCALS);

src/SetGetItem.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,29 +59,29 @@ py::object wrap(DDPTensorImpl::ptr_type tnsr, const py::handle &handle) {
5959

6060
// ***************************************************************************
6161

62-
struct DeferredGetLocal
62+
struct DeferredGetLocals
6363
: public DeferredT<GetItem::py_promise_type, GetItem::py_future_type> {
6464
id_type _a;
6565
py::handle _handle;
6666

67-
DeferredGetLocal() = default;
68-
DeferredGetLocal(const tensor_i::future_type &a, py::handle &handle)
67+
DeferredGetLocals() = default;
68+
DeferredGetLocals(const tensor_i::future_type &a, py::handle &handle)
6969
: _a(a.guid()), _handle(handle) {}
7070

7171
void run() override {
7272
auto aa = std::move(Registry::get(_a).get());
7373
auto a_ptr = std::dynamic_pointer_cast<DDPTensorImpl>(aa);
7474
assert(a_ptr);
7575
auto res = wrap(a_ptr, _handle);
76-
set_value(res);
76+
set_value(py::make_tuple(res));
7777
}
7878

7979
bool generate_mlir(::mlir::OpBuilder &builder, ::mlir::Location loc,
8080
jit::DepManager &dm) override {
8181
return true;
8282
}
8383

84-
FactoryId factory() const { return F_GETLOCAL; }
84+
FactoryId factory() const { return F_GETLOCALS; }
8585

8686
template <typename S> void serialize(S &ser) {
8787
ser.template value<sizeof(_a)>(_a);
@@ -345,8 +345,8 @@ ddptensor *GetItem::__getitem__(const ddptensor &a,
345345
return new ddptensor(defer<DeferredGetItem>(a.get(), std::move(slc)));
346346
}
347347

348-
GetItem::py_future_type GetItem::get_local(const ddptensor &a, py::handle h) {
349-
return defer<DeferredGetLocal>(a.get(), h);
348+
GetItem::py_future_type GetItem::get_locals(const ddptensor &a, py::handle h) {
349+
return defer<DeferredGetLocals>(a.get(), h);
350350
}
351351

352352
GetItem::py_future_type GetItem::gather(const ddptensor &a, rank_type root) {
@@ -377,4 +377,4 @@ FACTORY_INIT(DeferredGetItem, F_GETITEM);
377377
FACTORY_INIT(DeferredSetItem, F_SETITEM);
378378
FACTORY_INIT(DeferredMap, F_MAP);
379379
FACTORY_INIT(DeferredGather, F_GATHER);
380-
FACTORY_INIT(DeferredGetLocal, F_GETLOCAL);
380+
FACTORY_INIT(DeferredGetLocals, F_GETLOCALS);

src/ddptensor.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,11 @@ PYBIND11_MODULE(_ddptensor, m) {
135135
.def("sync", &sync_promises)
136136
.def("myrank", &myrank)
137137
.def("_get_slice", &GetItem::get_slice)
138-
.def("_get_local",
138+
.def("_get_locals",
139139
[](const ddptensor &f, py::handle h) {
140-
PY_SYNC_RETURN(GetItem::get_local(f, h));
140+
PY_SYNC_RETURN(GetItem::get_locals(f, h));
141141
})
142+
.def("_from_locals", &IO::from_locals)
142143
.def("_gather",
143144
[](const ddptensor &f, rank_type root = REPLICATED) {
144145
PY_SYNC_RETURN(GetItem::gather(f, root));

src/include/ddptensor/CppTypes.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,11 @@ enum FactoryId : int {
199199
F_ARANGE,
200200
F_EWBINOP,
201201
F_EWUNYOP,
202+
F_FROMLOCALS,
202203
F_FULL,
203204
F_GATHER,
204205
F_GETITEM,
205-
F_GETLOCAL,
206+
F_GETLOCALS,
206207
F_IEWBINOP,
207208
F_LINALGOP,
208209
F_LINSPACE,

src/include/ddptensor/DDPTensorImpl.hpp

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,25 @@
1717

1818
class Transceiver;
1919

20+
/// @brief use this to provide a base object to the tensor
21+
// such a base object can own shared data
22+
// you might need to implement reference counting
23+
struct BaseObj {
24+
virtual ~BaseObj() {}
25+
};
26+
27+
/// @brief Simple implementation of BaseObj for ref-counting types
28+
/// @tparam T ref-counting type, such as py::object or std::shared_ptr
29+
/// we keep an object of the ref-counting type. Normal ref-counting/destructors
30+
/// will take care of the rest.
31+
template <typename T> struct SharedBaseObject : public BaseObj {
32+
SharedBaseObject(const SharedBaseObject &) = default;
33+
SharedBaseObject(SharedBaseObject &&) = default;
34+
SharedBaseObject(const T &o) : _base(o) {}
35+
SharedBaseObject(T &&o) : _base(std::forward<T>(o)) {}
36+
T _base;
37+
};
38+
2039
/// The actual implementation of the DDPTensor, implementing the tensor_i
2140
/// interface. It holds the tensor data and some meta information. The member
2241
/// attributes are mostly inspired by the needs of interacting with MLIR. It
@@ -25,6 +44,7 @@ class Transceiver;
2544
/// Here, the halos are never used for anything except for interchanging with
2645
/// MLIR.
2746
class DDPTensorImpl : public tensor_i {
47+
2848
mutable rank_type _owner;
2949
Transceiver *_transceiver = nullptr;
3050
shape_type _gShape = {};
@@ -34,7 +54,7 @@ class DDPTensorImpl : public tensor_i {
3454
DynMemRef _lData;
3555
DynMemRef _rhsHalo;
3656
DTypeId _dtype = DTYPE_LAST;
37-
tensor_i::ptr_type _base;
57+
BaseObj *_base = nullptr;
3858

3959
public:
4060
using ptr_type = std::shared_ptr<DDPTensorImpl>;
@@ -63,8 +83,14 @@ class DDPTensorImpl : public tensor_i {
6383
// incomplete, useful for computing meta information
6484
DDPTensorImpl() : _owner(REPLICATED) { assert(ndims() <= 1); }
6585

86+
// From numpy
87+
// FIXME multi-proc
88+
DDPTensorImpl(DTypeId dtype, ssize_t ndims, const ssize_t *shape,
89+
const intptr_t *strides, void *data);
90+
6691
// set the base tensor
67-
void set_base(const tensor_i::ptr_type &base) { _base = base; }
92+
void set_base(const tensor_i::ptr_type &base);
93+
void set_base(BaseObj *obj);
6894

6995
virtual ~DDPTensorImpl();
7096

src/include/ddptensor/IO.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
#pragma once
88

99
#include "ddptensor/SetGetItem.hpp"
10+
#include <pybind11/numpy.h>
11+
namespace py = pybind11;
12+
#include <vector>
1013

1114
struct IO {
1215
static GetItem::py_future_type to_numpy(const ddptensor &a);
16+
static ddptensor *from_locals(const std::vector<py::array> &a);
1317
};

src/include/ddptensor/SetGetItem.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ struct GetItem {
1919
const std::vector<py::slice> &v);
2020
static py::object get_slice(const ddptensor &a,
2121
const std::vector<py::slice> &v);
22-
static py_future_type get_local(const ddptensor &a, py::handle h);
22+
static py_future_type get_locals(const ddptensor &a, py::handle h);
2323
static py_future_type gather(const ddptensor &a, rank_type root);
2424
};
2525

src/jit/mlir.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,12 @@ ::mlir::Value DepManager::getDependent(::mlir::OpBuilder &builder,
178178
auto rank = impl->ndims();
179179
::mlir::SmallVector<int64_t> lhShape(rank), ownShape(rank), rhShape(rank);
180180
for (size_t i = 0; i < rank; i++) {
181-
lhShape[i] = impl->lh_shape()[i];
181+
lhShape[i] = impl->lh_shape() ? impl->lh_shape()[i] : 0;
182182
ownShape[i] = impl->local_shape()[i];
183-
rhShape[i] = impl->rh_shape()[i];
183+
rhShape[i] = impl->rh_shape() ? impl->rh_shape()[i] : 0;
184184
}
185185
auto typ = getTType(
186-
builder, fut.dtype(),
186+
builder, impl->dtype(),
187187
::mlir::SmallVector<int64_t>(impl->shape(), impl->shape() + rank),
188188
lhShape, ownShape, rhShape, fut.team(), fut.balanced());
189189
_func.insertArgument(idx, typ, {}, loc);

0 commit comments

Comments
 (0)