adding PGAS feature 'get_slice'

fschlimb · fschlimb · commit a6a8d78dd2f1 · 2022-02-04T07:15:54.000-06:00
diff --git a/ddptensor/ddptensor.py b/ddptensor/ddptensor.py
@@ -116,3 +116,6 @@ def __getitem__(self, *args):
 
     def __setitem__(self, key, value):
         x = self._t.__setitem__(key, value._t if isinstance(value, dtensor) else value)
+
+    def get_slice(self, *args):
+        return self._t.get_slice(*args)
diff --git a/src/MPIMediator.cpp b/src/MPIMediator.cpp
@@ -30,6 +30,7 @@ MPIMediator::MPIMediator()
 MPIMediator::~MPIMediator()
 {
     std::cerr << "MPIMediator::~MPIMediator()" << std::endl;
+    MPI_Barrier(MPI_COMM_WORLD);
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     Buffer buff;
@@ -40,7 +41,6 @@ MPIMediator::~MPIMediator()
     MPI_Send(buff.data(), buff.size(), MPI_CHAR, rank, PULL_TAG, MPI_COMM_WORLD);
     _listener.join();
     s_ak.clear();
-    MPI_Barrier(MPI_COMM_WORLD);
 }
 
 uint64_t MPIMediator::register_array(tensor_i::ptr_type ary)
@@ -49,7 +49,7 @@ uint64_t MPIMediator::register_array(tensor_i::ptr_type ary)
     return s_last_id;
 }
 
-void MPIMediator::pull(rank_type from, const tensor_i::ptr_type & ary, const NDSlice & slice, void * rbuff)
+void MPIMediator::pull(rank_type from, const tensor_i * ary, const NDSlice & slice, void * rbuff)
 {
     MPI_Comm comm = MPI_COMM_WORLD;
     MPI_Request request[2];
diff --git a/src/ddptensor.cpp b/src/ddptensor.cpp
@@ -50,6 +50,11 @@ class dtensor
         return _tensor->dtype();
     }
 
+    py::object get_slice(const std::vector<py::slice> & v)
+    {
+        return _tensor->get_slice(NDSlice(v));
+    }
+        
     dtensor __getitem__(const NDIndex & v)
     {
         return dtensor(_tensor->__getitem__(NDSlice(v)));
@@ -224,7 +229,8 @@ PYBIND11_MODULE(_ddptensor, m) {
         .def("__getitem__", py::overload_cast<const std::vector<py::slice> &>(&dtensor::__getitem__))
         .def("__getitem__", py::overload_cast<const py::slice &>(&dtensor::__getitem__))
         .def("__getitem__", py::overload_cast<int64_t>(&dtensor::__getitem__))
-        .def("__setitem__", &dtensor::__setitem__);
+        .def("__setitem__", &dtensor::__setitem__)
+        .def("get_slice", &dtensor::get_slice);
 
     //py::class_<dpdlpack>(m, "dpdlpack")
     //    .def("__dlpack__", &dpdlpack.__dlpack__);
diff --git a/src/include/ddptensor/MPIMediator.hpp b/src/include/ddptensor/MPIMediator.hpp
@@ -13,7 +13,7 @@ class MPIMediator : public Mediator
     MPIMediator();
     virtual ~MPIMediator();
     virtual uint64_t register_array(tensor_i::ptr_type ary);
-    virtual void pull(rank_type from, const tensor_i::ptr_type & ary, const NDSlice & slice, void * buffer);
+    virtual void pull(rank_type from, const tensor_i * ary, const NDSlice & slice, void * buffer);
 
 protected:
     void listen();
diff --git a/src/include/ddptensor/Mediator.hpp b/src/include/ddptensor/Mediator.hpp
@@ -13,7 +13,7 @@ class Mediator
 public:
     virtual ~Mediator() {}
     virtual uint64_t register_array(tensor_i::ptr_type ary) = 0;
-    virtual void pull(rank_type from, const tensor_i::ptr_type & ary, const NDSlice & slice, void * buffer) = 0;
+    virtual void pull(rank_type from, const tensor_i * ary, const NDSlice & slice, void * buffer) = 0;
 };
 
 extern Mediator * theMediator;
diff --git a/src/include/ddptensor/PVSlice.hpp b/src/include/ddptensor/PVSlice.hpp
@@ -7,6 +7,8 @@
 
 using offsets_type = std::vector<uint64_t>;
 
+constexpr static int NOSPLIT = -1;
+
 class BasePVSlice
 {
     uint64_t   _offset;
@@ -18,13 +20,13 @@ class BasePVSlice
     BasePVSlice(const BasePVSlice &) = delete;
     BasePVSlice(BasePVSlice &&) = default;
     BasePVSlice(const shape_type & shape, int split=0)
-        : _offset((shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),
+        : _offset(split == NOSPLIT ? 0 : (shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),
           _shape(shape),
           _split_dim(split)
     {
     }
     BasePVSlice(shape_type && shape, int split=0)
-        : _offset((shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),
+        : _offset(split == NOSPLIT ? 0 : (shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),
           _shape(std::move(shape)),
           _split_dim(split)
     {
@@ -35,6 +37,9 @@ class BasePVSlice
     const shape_type & shape() const { return _shape; }
     shape_type shape(rank_type rank) const
     {
+        if(split_dim() == NOSPLIT) {
+            return rank == theTransceiver->rank() ? _shape : shape_type();
+        }
         shape_type shp(_shape);
         auto end = (rank+1) * _offset;
         if(end <= _shape[_split_dim]) shp[_split_dim] = _offset;
@@ -43,7 +48,7 @@ class BasePVSlice
     }
     rank_type owner(const NDSlice & slice) const
     {
-        return slice.dim(split_dim())._start / offset();
+        return split_dim() == NOSPLIT ? theTransceiver->rank() : slice.dim(split_dim())._start / offset();
     }
 };
 
@@ -150,10 +155,12 @@ class PVSlice
         return _slice;
     }
 
+#if 0
     NDSlice normalized_slice() const
     {
         return _slice.normalize(_base->split_dim());
     }
+#endif
 
     NDSlice map_slice(const NDSlice & slc) const
     {
@@ -162,11 +169,17 @@ class PVSlice
 
     NDSlice slice_of_rank(rank_type rank) const
     {
+        if(_base->split_dim() == NOSPLIT) {
+            return rank == theTransceiver->rank() ? slice() : NDSlice();
+        }
         return _slice.trim(_base->split_dim(), rank * _base->offset(), (rank+1) * _base->offset());
     }
 
     NDSlice local_slice_of_rank(rank_type rank) const
     {
+        if(_base->split_dim() == NOSPLIT) {
+            return rank == theTransceiver->rank() ? slice() : NDSlice();
+        }
         return _slice.trim_shift(_base->split_dim(),
                                  rank * _base->offset(),
                                  (rank+1) * _base->offset(),
@@ -175,6 +188,7 @@ class PVSlice
 
     bool need_reduce(const dim_vec_type & dims) const
     {
+        if(_base->split_dim() == NOSPLIT) return false;
         auto nd = dims.size();
         // Reducing to a single scalar or over a subset of dimensions *including* the split axis.
         if(nd == 0
diff --git a/src/include/ddptensor/ddptensor_impl.hpp b/src/include/ddptensor/ddptensor_impl.hpp
@@ -224,13 +224,17 @@ class dtensor_impl : public tensor_i
     }
 
     // since the API works on tensor_i we need to downcast to the actual type
-    const dtensor_impl<T> * cast(const ptr_type & b) const
+    static dtensor_impl<T> * cast(ptr_type & b)
     {
         // FIXME; use attribute/vfunction + reinterpret_cast for better performance
-        auto ptr = dynamic_cast<const dtensor_impl<T>*>(b.get());
+        auto ptr = dynamic_cast<dtensor_impl<T>*>(b.get());
         // if(ptr == nullptr) throw(std::runtime_error("Incompatible tensor types."));
         return ptr;
     }
+    static const dtensor_impl<T> * cast(const ptr_type & b)
+    {
+        return cast(const_cast<ptr_type &>(b));
+    }
 
     ptr_type _ew_op(const char * op, const char * mod, py::args args, const py::kwargs & kwargs)
     {
@@ -331,42 +335,28 @@ class dtensor_impl : public tensor_i
         }
     }
 
-    // FIXME We use a generic SPMD/PGAS mechanism to pull elements from remote
-    // on all procs simultaneously.  Since __setitem__ is collective we could
-    // implement a probaly more efficient mechanism which pushes data and/or using RMA.
-    void __setitem__(const NDSlice & slice, const ptr_type & val)
+    // copy data from (*val)[val_slice] into (*dest)[dest_slice]
+    // this is a non-collective call.
+    static void _set_slice(const dtensor_impl<T> * val, const NDSlice & val_slice, dtensor_impl<T> * dest, const NDSlice & dest_slice)
     {
-        std::cerr << " __setitem__ " << slice << " " << val->pvslice().slice() << std::endl;
-        auto nd = shape().size();
-        if(owner() == REPLICATED && nd > 0)
+        std::cerr << "_set_slice " << val_slice << " " << dest_slice << std::endl;
+        auto nd = dest->shape().size();
+        if(dest->owner() == REPLICATED && nd > 0)
             std::cerr << "Warning: __setitem__ on replicated data updates local tile only" << std::endl;
-        if(nd != slice.ndims())
+        if(nd != dest_slice.ndims())
             throw std::runtime_error("Index dimensionality must match array dimensionality");
+        if(val_slice.size() != dest_slice.size())
+            throw std::runtime_error("Input and output slices must be of same size");
 
-        auto slc_sz = slice.size();
-        auto val_sz = VPROD(val->shape());
-        if(slc_sz != val_sz)
-            throw std::runtime_error("Given tensor does not match: it has different size than 'slice'");
-
-        NDSlice norm_slice = pvslice().normalized_slice();
-        std::cerr << "norm_slice: " << norm_slice << std::endl;
         // Use given slice to create a global view into orig array
-        PVSlice g_slc_view(pvslice(), slice);
+        PVSlice g_slc_view(dest->pvslice(), dest_slice);
         std::cerr << "g_slice: " << g_slc_view.slice() << std::endl;
-        PVSlice my_view(g_slc_view, theTransceiver->rank());
-        NDSlice my_slice = my_view.slice();
-        std::cerr << "my_slice: " << my_slice << std::endl;
-        NDSlice my_norm_slice = g_slc_view.map_slice(my_slice);
-        std::cerr << "my_norm_slice: " << my_norm_slice << std::endl;
-
         // Create a view into val
-        PVSlice needed_val_view(val->pvslice(), my_norm_slice);
+        PVSlice needed_val_view(val->pvslice(), val_slice);
         std::cerr << "needed_val_view: " << needed_val_view.slice() << " (was " << val->pvslice().slice() << ")" << std::endl;
 
         // Get the pointer to the local buffer
-        auto ns = get_array_impl(_pyarray);
-        //auto my_binfo = _pyarray.cast<py::buffer>().request();
-        // T * my_buffer = reinterpret_cast<T*>(my_binfo.ptr);
+        auto ns = get_array_impl(dest->_pyarray);
 
         // we can now compute which ranks actually hold which piece of the data from val that we need locally
         for(rank_type i=0; i<theTransceiver->nranks(); ++i ) {
@@ -377,7 +367,7 @@ class dtensor_impl : public tensor_i
             std::cerr << i << " curr_needed_val_slice: " << curr_needed_val_slice << std::endl;
             NDSlice curr_local_val_slice = val_local_view.map_slice(curr_needed_val_slice);
             std::cerr << i << " curr_local_val_slice: " << curr_local_val_slice << std::endl;
-            NDSlice curr_needed_norm_slice = val->pvslice().map_slice(curr_needed_val_slice);
+            NDSlice curr_needed_norm_slice = needed_val_view.map_slice(curr_needed_val_slice);
             std::cerr << i << " curr_needed_norm_slice: " << curr_needed_norm_slice << std::endl;
             PVSlice my_curr_needed_view = PVSlice(g_slc_view, curr_needed_norm_slice);
             std::cerr << i << " my_curr_needed_slice: " << my_curr_needed_view.slice() << std::endl;
@@ -387,23 +377,39 @@ class dtensor_impl : public tensor_i
                 py::tuple tpl = _make_tuple(my_curr_local_slice); //my_curr_view.slice());
                 if(i == theTransceiver->rank()) {
                     // copy locally
-                    auto rhs = cast(val)->_pyarray.attr("__getitem__")(_make_tuple(curr_local_val_slice));
+                    auto rhs = val->_pyarray.attr("__getitem__")(_make_tuple(curr_local_val_slice));
                     std::cerr << py::str(rhs).cast<std::string>() << std::endl;
-                    _pyarray.attr("__setitem__")(tpl, rhs);
+                    dest->_pyarray.attr("__setitem__")(tpl, rhs);
                 } else {
                     // pull slice directly into new array
                     auto obj = ns.attr("empty")(_make_tuple(curr_local_val_slice.shape()));
                     auto binfo = obj.cast<py::buffer>().request();
                     theMediator->pull(i, val, curr_local_val_slice, binfo.ptr);
-                    _pyarray.attr("__setitem__")(tpl, obj);
+                    dest->_pyarray.attr("__setitem__")(tpl, obj);
                 }
             }
         }
     }
 
+    // FIXME We use a generic SPMD/PGAS mechanism to pull elements from remote
+    // on all procs simultaneously.  Since __setitem__ is collective we could
+    // implement a probably more efficient mechanism which pushes data and/or uses RMA.
+    void __setitem__(const NDSlice & slice, const ptr_type & val)
+    {
+        // Use given slice to create a global view into orig array
+        PVSlice g_slc_view(this->pvslice(), slice);
+        std::cerr << "g_slice: " << g_slc_view.slice() << std::endl;
+        NDSlice my_slice = g_slc_view.slice_of_rank(theTransceiver->rank());
+        std::cerr << "my_slice: " << my_slice << std::endl;
+        NDSlice my_norm_slice = g_slc_view.map_slice(my_slice);
+        std::cerr << "my_norm_slice: " << my_norm_slice << std::endl;
+
+        _set_slice(cast(val), my_norm_slice, this, my_slice);
+    }
+
     void bufferize(const NDSlice & slice, Buffer & buff)
     {
-        PVSlice my_local_view = PVSlice(tile_shape()); // pvslice().view_normalized_by_rank(theTransceiver->rank());
+        PVSlice my_local_view = PVSlice(tile_shape());
         PVSlice lview = PVSlice(my_local_view, slice);
         NDSlice lslice = lview.slice();
 
@@ -422,6 +428,14 @@ class dtensor_impl : public tensor_i
         }
     }
 
+    py::object get_slice(const NDSlice & slice) const
+    {
+        auto shp = slice.shape();
+        auto out = create_dtensor(PVSlice(shp, NOSPLIT), shp, DTYPE<T>::value, "empty");
+        _set_slice(this, slice, cast(out), {shp});
+        return cast(out)->_pyarray;
+    }
+
     std::string __repr__() const
     {
         return "dtensor(shape=" + to_string(shape(), 'x') + ", n_tiles="
diff --git a/src/include/ddptensor/tensor_i.hpp b/src/include/ddptensor/tensor_i.hpp
@@ -44,4 +44,6 @@ class tensor_i
     virtual void _ew_binary_op_inplace(const char * op, const ptr_type & b) = 0;
     virtual void _ew_binary_op_inplace(const char * op, const py::object & b) = 0;
     virtual ptr_type _reduce_op(const char * op, const dim_vec_type & dims) const = 0;
+
+    virtual py::object get_slice(const NDSlice & slice) const = 0;
 };
diff --git a/test/test_spmd.py b/test/test_spmd.py
@@ -0,0 +1,8 @@
+from mpi4py import MPI
+import ddptensor as dt
+a = dt.ones((8,8))
+MPI.COMM_WORLD.barrier()
+b = a.get_slice((slice(1, 3+MPI.COMM_WORLD.rank), slice(2, 4+MPI.COMM_WORLD.rank)))
+print(type(b), b.shape, float(b[1,1]))
+print("done")
+dt.fini()

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ MPIMediator::MPIMediator()`
`30`	`30`	`MPIMediator::~MPIMediator()`
`31`	`31`	`{`
`32`	`32`	`std::cerr << "MPIMediator::~MPIMediator()" << std::endl;`
	`33`	`+ MPI_Barrier(MPI_COMM_WORLD);`
`33`	`34`	`int rank;`
`34`	`35`	`MPI_Comm_rank(MPI_COMM_WORLD, &rank);`
`35`	`36`	`Buffer buff;`
`@@ -40,7 +41,6 @@ MPIMediator::~MPIMediator()`
`40`	`41`	`MPI_Send(buff.data(), buff.size(), MPI_CHAR, rank, PULL_TAG, MPI_COMM_WORLD);`
`41`	`42`	`_listener.join();`
`42`	`43`	`s_ak.clear();`
`43`		`- MPI_Barrier(MPI_COMM_WORLD);`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`uint64_t MPIMediator::register_array(tensor_i::ptr_type ary)`
`@@ -49,7 +49,7 @@ uint64_t MPIMediator::register_array(tensor_i::ptr_type ary)`
`49`	`49`	`return s_last_id;`
`50`	`50`	`}`
`51`	`51`
`52`		`-void MPIMediator::pull(rank_type from, const tensor_i::ptr_type & ary, const NDSlice & slice, void * rbuff)`
	`52`	`+void MPIMediator::pull(rank_type from, const tensor_i * ary, const NDSlice & slice, void * rbuff)`
`53`	`53`	`{`
`54`	`54`	`MPI_Comm comm = MPI_COMM_WORLD;`
`55`	`55`	`MPI_Request request[2];`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,8 @@`
`7`	`7`
`8`	`8`	`using offsets_type = std::vector<uint64_t>;`
`9`	`9`
	`10`	`+constexpr static int NOSPLIT = -1;`
	`11`	`+`
`10`	`12`	`class BasePVSlice`
`11`	`13`	`{`
`12`	`14`	`uint64_t _offset;`
`@@ -18,13 +20,13 @@ class BasePVSlice`
`18`	`20`	`BasePVSlice(const BasePVSlice &) = delete;`
`19`	`21`	`BasePVSlice(BasePVSlice &&) = default;`
`20`	`22`	`BasePVSlice(const shape_type & shape, int split=0)`
`21`		`- : _offset((shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),`
	`23`	`+ : _offset(split == NOSPLIT ? 0 : (shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),`
`22`	`24`	`_shape(shape),`
`23`	`25`	`_split_dim(split)`
`24`	`26`	`{`
`25`	`27`	`}`
`26`	`28`	`BasePVSlice(shape_type && shape, int split=0)`
`27`		`- : _offset((shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),`
	`29`	`+ : _offset(split == NOSPLIT ? 0 : (shape[split] + theTransceiver->nranks() - 1) / theTransceiver->nranks()),`
`28`	`30`	`_shape(std::move(shape)),`
`29`	`31`	`_split_dim(split)`
`30`	`32`	`{`
`@@ -35,6 +37,9 @@ class BasePVSlice`
`35`	`37`	`const shape_type & shape() const { return _shape; }`
`36`	`38`	`shape_type shape(rank_type rank) const`
`37`	`39`	`{`
	`40`	`+ if(split_dim() == NOSPLIT) {`
	`41`	`+ return rank == theTransceiver->rank() ? _shape : shape_type();`
	`42`	`+ }`
`38`	`43`	`shape_type shp(_shape);`
`39`	`44`	`auto end = (rank+1) * _offset;`
`40`	`45`	`if(end <= _shape[_split_dim]) shp[_split_dim] = _offset;`
`@@ -43,7 +48,7 @@ class BasePVSlice`
`43`	`48`	`}`
`44`	`49`	`rank_type owner(const NDSlice & slice) const`
`45`	`50`	`{`
`46`		`- return slice.dim(split_dim())._start / offset();`
	`51`	`+ return split_dim() == NOSPLIT ? theTransceiver->rank() : slice.dim(split_dim())._start / offset();`
`47`	`52`	`}`
`48`	`53`	`};`
`49`	`54`
`@@ -150,10 +155,12 @@ class PVSlice`
`150`	`155`	`return _slice;`
`151`	`156`	`}`
`152`	`157`
	`158`	`+#if 0`
`153`	`159`	`NDSlice normalized_slice() const`
`154`	`160`	`{`
`155`	`161`	`return _slice.normalize(_base->split_dim());`
`156`	`162`	`}`
	`163`	`+#endif`
`157`	`164`
`158`	`165`	`NDSlice map_slice(const NDSlice & slc) const`
`159`	`166`	`{`
`@@ -162,11 +169,17 @@ class PVSlice`
`162`	`169`
`163`	`170`	`NDSlice slice_of_rank(rank_type rank) const`
`164`	`171`	`{`
	`172`	`+ if(_base->split_dim() == NOSPLIT) {`
	`173`	`+ return rank == theTransceiver->rank() ? slice() : NDSlice();`
	`174`	`+ }`
`165`	`175`	`return _slice.trim(_base->split_dim(), rank * _base->offset(), (rank+1) * _base->offset());`
`166`	`176`	`}`
`167`	`177`
`168`	`178`	`NDSlice local_slice_of_rank(rank_type rank) const`
`169`	`179`	`{`
	`180`	`+ if(_base->split_dim() == NOSPLIT) {`
	`181`	`+ return rank == theTransceiver->rank() ? slice() : NDSlice();`
	`182`	`+ }`
`170`	`183`	`return _slice.trim_shift(_base->split_dim(),`
`171`	`184`	`rank * _base->offset(),`
`172`	`185`	`(rank+1) * _base->offset(),`
`@@ -175,6 +188,7 @@ class PVSlice`
`175`	`188`
`176`	`189`	`bool need_reduce(const dim_vec_type & dims) const`
`177`	`190`	`{`
	`191`	`+ if(_base->split_dim() == NOSPLIT) return false;`
`178`	`192`	`auto nd = dims.size();`
`179`	`193`	`// Reducing to a single scalar or over a subset of dimensions including the split axis.`
`180`	`194`	`if(nd == 0`