adding spmd.get_local

fschlimb · fschlimb · commit 181d4634486c · 2022-03-11T05:04:51.000-06:00
diff --git a/ddptensor/spmd.py b/ddptensor/spmd.py
@@ -1,4 +1,7 @@
 from . import _ddptensor as _cdt
 
-def get_slice(self, *args):
-    return _cdt._get_slice(self._t, *args)
+def get_slice(obj, *args):
+    return _cdt._get_slice(obj._t, *args)
+
+def get_local(obj):
+    return  _cdt._get_local(obj._t, obj)
diff --git a/src/Creator.cpp b/src/Creator.cpp
@@ -14,7 +14,7 @@ namespace x {
         static ptr_type op(CreatorId c, const shape_type & shp)
         {
             PVSlice pvslice(shp);
-            shape_type shape(std::move(pvslice.tile_shape()));
+            shape_type shape(std::move(pvslice.shape_of_rank()));
             switch(c) {
             case EMPTY:
                 return operatorx<T>::mk_tx(std::move(pvslice), std::move(xt::empty<T>(std::move(shape))));
@@ -32,7 +32,7 @@ namespace x {
         {
             if(c == FULL) {
                 PVSlice pvslice(shp);
-                shape_type shape(std::move(pvslice.tile_shape()));
+                shape_type shape(std::move(pvslice.shape_of_rank()));
                 auto a = xt::empty<T>(std::move(shape));
                 a.fill(to_native<T>(v));
                 return operatorx<T>::mk_tx(std::move(pvslice), std::move(a));
diff --git a/src/ManipOp.cpp b/src/ManipOp.cpp
@@ -0,0 +1,29 @@
+#include <mpi.h>
+#include "ddptensor/ManipOp.hpp"
+#include "ddptensor/TypeDispatch.hpp"
+#include "ddptensor/x.hpp"
+#include "ddptensor/CollComm.hpp"
+
+namespace x {
+
+    class ManipOp
+    {
+    public:
+        using ptr_type = DPTensorBaseX::ptr_type;
+
+        // Reshape
+        // A tensor of the requested shape is created through operatorx<T>::mk_tx
+        // and then filled from a_ptr by CollComm::coll_copy.
+        // For now this always yields a copy/new array.
+        template<typename T>
+        static ptr_type op(const shape_type & shape, const std::shared_ptr<DPTensorX<T>> & a_ptr)
+        {
+            auto reshaped = x::operatorx<T>::mk_tx(shape);
+            CollComm::coll_copy(reshaped, a_ptr);
+            return reshaped;
+        }
+    };
+}
+
+// Entry point for reshape: forwards tensor `a` and the requested `shape`
+// to TypeDispatch<x::ManipOp> and returns whatever that yields.
+tensor_i::ptr_type ManipOp::reshape(x::DPTensorBaseX::ptr_type a, const shape_type & shape)
+{
+    return TypeDispatch<x::ManipOp>(a, shape);
+}
diff --git a/src/Random.cpp b/src/Random.cpp
@@ -14,7 +14,7 @@ namespace x {
         {
             if constexpr (std::is_floating_point<T>::value) {
                 PVSlice pvslice(shp);
-                shape_type shape(std::move(pvslice.tile_shape()));
+                shape_type shape(std::move(pvslice.shape_of_rank()));
                 return operatorx<T>::mk_tx(std::move(pvslice), std::move(xt::random::rand(std::move(shape), to_native<T>(lower), to_native<T>(upper))));
             }
         }
diff --git a/src/SetGetItem.cpp b/src/SetGetItem.cpp
@@ -111,37 +111,60 @@ namespace x {
     public:
         using ptr_type = DPTensorBaseX::ptr_type;
 
+        // get_slice
+        // Allocates a py::array_t<T> with VPROD(slice.shape()) elements, lets
+        // SetItem::_set_slice fill it from a_ptr for the requested slice and returns it.
         template<typename T>
         static py::object op(const NDSlice & slice, const std::shared_ptr<DPTensorX<T>> & a_ptr)
         {
             auto shp = slice.shape();
             auto sz = VPROD(shp);
             auto res = py::array_t<T>(sz);
+            // adapt res' buffer (no ownership taken) so _set_slice writes directly into the numpy array
             auto ax = xt::adapt(res.mutable_data(), sz, xt::no_ownership(), shp);
-            std::cerr << ax << std::endl << py::str(res).cast<std::string>() << res.mutable_data() << std::endl;
-            // Create dtensor without creating id: do not use create_dtensor
-            // auto out = DPTensorX<T>(ax, PVSlice(shp, NOSPLIT));
+            // the target is described by a NOSPLIT PVSlice of the result shape
             PVSlice slc{shp, NOSPLIT};
             SetItem::_set_slice<T>(ax, slc, slc.slice(), a_ptr, slice);
-            std::cerr << ax << std::endl << py::str(res).cast<std::string>() << std::endl;
-            // res.reshape(shp);
             return res;
         }
+
+        // get_local
+        // Returns a py::array built on top of a_ptr's xarray buffer, restricted to
+        // the local slice of the calling rank. `handle` is handed to py::array as
+        // the base object of the returned array.
+        template<typename T>
+        static py::object op(py::handle & handle, const std::shared_ptr<DPTensorX<T>> & a_ptr)
+        {
+            auto slc = a_ptr->slice().local_slice_of_rank();
+            auto tshp = a_ptr->slice().tile_shape();
+            const int nd = static_cast<int>(slc.ndims());
+            // buffer protocol accepts strides in number of bytes not elements!
+            std::vector<uint64_t> strides(nd);
+            uint64_t off = 0;     // start offset in number of elements
+            uint64_t estride = 1; // distance (in elements) between neighbours along dim i of the local buffer
+            // Walk from the innermost to the outermost dimension. The plain element
+            // stride is tracked separately from the stepped view-stride, so a
+            // non-unit step in one dimension never leaks into the strides of the
+            // dimensions outside of it. The loop is a no-op for nd == 0.
+            for(int i = nd-1; i >= 0; --i) {
+                auto slci = slc.dim(i);
+                strides[i] = slci._step * estride * sizeof(T);
+                off += slci._start * estride;
+                estride *= tshp[i];
+            }
+            T * data = a_ptr->xarray().data();
+            return py::array(slc.shape(), std::move(strides), data + off, handle);
+        }
     };
 
 } // namespace x
 
-void SetItem::__setitem__(x::DPTensorBaseX::ptr_type a, const std::vector<py::slice> & v, x::DPTensorBaseX::ptr_type b)
+// Converts the python slices `v` into an NDSlice and forwards it together
+// with `a` and `b` to TypeDispatch<x::SetItem>.
+void SetItem::__setitem__(tensor_i::ptr_type a, const std::vector<py::slice> & v, tensor_i::ptr_type b)
 {
     return TypeDispatch<x::SetItem>(a, b, NDSlice(v));
 }
 
-tensor_i::ptr_type GetItem::__getitem__(x::DPTensorBaseX::ptr_type a, const std::vector<py::slice> & v)
+// Converts the python slices `v` into an NDSlice and forwards it together
+// with `a` to TypeDispatch<x::GetItem>.
+tensor_i::ptr_type GetItem::__getitem__(tensor_i::ptr_type a, const std::vector<py::slice> & v)
 {
     return TypeDispatch<x::GetItem>(a, NDSlice(v));
 }
 
-py::object GetItem::get_slice(x::DPTensorBaseX::ptr_type a, const std::vector<py::slice> & v)
+// Converts the python slices `v` into an NDSlice and forwards it together
+// with `a` to TypeDispatch<x::SPMD>; the result is a python object.
+py::object GetItem::get_slice(tensor_i::ptr_type a, const std::vector<py::slice> & v)
 {
     return TypeDispatch<x::SPMD>(a, NDSlice(v));
 }
+
+// Forwards tensor `a` and the python handle `h` to TypeDispatch<x::SPMD>;
+// the result is a python object.
+py::object GetItem::get_local(tensor_i::ptr_type a, py::handle h)
+{
+    return TypeDispatch<x::SPMD>(a, h);
+}
diff --git a/src/ddptensor.cpp b/src/ddptensor.cpp
@@ -62,7 +62,8 @@ PYBIND11_MODULE(_ddptensor, m) {
 
     m.def("fini", &fini)
         .def("myrank", &myrank)
-        .def("_get_slice", &GetItem::get_slice);
+        .def("_get_slice", &GetItem::get_slice)
+        .def("_get_local", &GetItem::get_local);
 
     py::class_<Creator>(m, "Creator")
         .def("create_from_shape", &Creator::create_from_shape)
diff --git a/src/include/ddptensor/CollComm.hpp b/src/include/ddptensor/CollComm.hpp
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <cassert>
+#include <vector>
+
+#include "UtilsAndTypes.hpp"
+#include "x.hpp"
+
+struct CollComm
+{
+    // Copies the content of a_ptr into b_ptr with a single all-to-all exchange
+    // and returns b_ptr.
+    //
+    // We assume we split in first dimension.
+    // We also assume partitions are assigned to ranks in sequence from 0-N.
+    // With this we know that our buffers (old and new) get data in the
+    // same order. The only thing which might have changed is the tile-size.
+    // Actually, the tile-size might change only if old or new shape does not evenly
+    // distribute data (e.g. last partition is smaller).
+    // In theory we could re-shape in-place when the norm-tile-size does not change.
+    // This is not implemented: we need an extra mechanism to work with reshape-views or alike.
+    //
+    // Neither tensor may be sliced (asserted).
+    // NOTE(review): the send side is described with DTYPE<U> and the receive side
+    // with DTYPE<T>; presumably callers only use T == U - confirm.
+    template<typename T, typename U>
+    static tensor_i::ptr_type coll_copy(std::shared_ptr<x::DPTensorX<T>> b_ptr, const std::shared_ptr<x::DPTensorX<U>> & a_ptr)
+    {
+        assert(! a_ptr->is_sliced() && ! b_ptr->is_sliced());
+
+        auto o_slc = a_ptr->slice();
+        // norm tile-size of  orig array
+        auto o_ntsz =  o_slc.tile_size(0);
+        // tilesize of my local partition of orig array
+        auto o_tsz = o_slc.tile_size();
+        // linearized local slice of orig array
+        auto o_llslc = Slice(o_ntsz * theTransceiver->rank(), o_ntsz * theTransceiver->rank() + o_tsz);
+
+        PVSlice n_slc = b_ptr->slice();
+        // norm tile-size of new (reshaped) array
+        auto n_ntsz =  n_slc.tile_size(0);
+        // tilesize of my local partition of new (reshaped) array
+        auto n_tsz = n_slc.tile_size();
+        // linearized/flattened/1d local slice of new (reshaped) array
+        auto n_llslc = Slice(n_ntsz * theTransceiver->rank(), n_ntsz * theTransceiver->rank() + n_tsz);
+
+        auto nr = theTransceiver->nranks();
+        // Counts and displacements in send/recv buffers, one entry per rank.
+        // std::vector instead of variable-length arrays: VLAs (and initializing
+        // them) are not standard C++; .data() yields the C-arrays MPI needs.
+        std::vector<int> counts_send(nr, 0);
+        std::vector<int> disp_send(nr, 0);
+        std::vector<int> counts_recv(nr, 0);
+        std::vector<int> disp_recv(nr, 0);
+
+        for(decltype(nr) r = 0; r < nr; ++r) {
+            // determine what I receive from rank r
+            // e.g. which parts of my new slice overlap with rank r's old slice
+            // Get local slice of rank r of orig array
+            auto o_rslc = o_slc.local_slice_of_rank(r);
+            // Flatten to 1d
+            auto o_lrslc = Slice(o_ntsz * r, o_ntsz * r + o_rslc.size());
+            // Determine overlap with local partition of linearized new array
+            auto roverlap = n_llslc.overlap(o_lrslc);
+            // number of elements to be received from rank r
+            counts_recv[r] = roverlap.size();
+            // displacement in new array where elements from rank r get copied to
+            disp_recv[r] = roverlap._start - n_llslc._start;
+
+            // determine what I send to rank r
+            // e.g. which parts of my old slice overlap with rank r's new slice
+            // Get local slice of rank r of new array
+            auto n_rslc = n_slc.local_slice_of_rank(r);
+            // Flatten to 1d
+            auto n_lrslc = Slice(n_ntsz * r, n_ntsz * r + n_rslc.size());
+            // Determine overlap with local partition of linearized orig array
+            auto soverlap = o_llslc.overlap(n_lrslc);
+            // number of elements to be sent to rank r
+            counts_send[r] = soverlap.size();
+            // displacement in orig array where elements for rank r get copied from
+            disp_send[r] = soverlap._start - o_llslc._start;
+        }
+
+        // Now we can send/recv directly to/from xarray buffers.
+        theTransceiver->alltoall(a_ptr->xarray().data(), // buffer_send
+                                 counts_send.data(),
+                                 disp_send.data(),
+                                 DTYPE<U>::value,        // element type of the send buffer (a_ptr)
+                                 b_ptr->xarray().data(), // buffer_recv
+                                 counts_recv.data(),
+                                 disp_recv.data(),
+                                 DTYPE<T>::value);       // element type of the recv buffer (b_ptr)
+
+        return b_ptr;
+    }
+};
diff --git a/src/include/ddptensor/NDIndex.hpp b/src/include/ddptensor/NDIndex.hpp
@@ -9,6 +9,7 @@
 ///
 typedef std::vector<int64_t> NDIndex;
 
+#if 0
 ///
 /// @return tile-sizes for each dimension, as if leading dimensions were cut.
 /// @param tile_shape tile-shape in question
@@ -42,3 +43,4 @@ uint64_t linearize(const std::vector<T> & idx, const std::vector<uint64_t> & tss
     }
     return tidx;
 }
+#endif
diff --git a/src/include/ddptensor/NDSlice.hpp b/src/include/ddptensor/NDSlice.hpp
@@ -97,10 +97,10 @@ class NDSlice {
     ///
     /// @return total number of elements represented by the nd-slice
     ///
-    value_type::value_type size() const
+    // Returns entry `dim` of the lazily initialized _sizes; with the default
+    // dim == 0 this is _sizes[0], i.e. the same value the parameterless version returned.
+    // NOTE(review): dim is not range-checked against _sizes - confirm callers stay in bounds.
+    value_type::value_type size(uint64_t dim = 0) const
     {
         if(_sizes.empty()) init_sizes();
-        return _sizes[0];
+        return _sizes[dim];
     }
 
     ///
@@ -120,29 +120,6 @@ class NDSlice {
         _sizes.resize(0);
     }
 
-    ///
-    /// @return ith index-tuple in canonical (flat) order of the expanded slice.
-    /// does not check bounds, e.g. can return indices beyond end of slice
-    ///
-    value_type operator[](value_type::value_type i) const {
-        if(_sizes.empty()) init_sizes();
-        value_type ret(_slice_vec.size(), 0);
-        auto sz = ++(_sizes.begin());
-        auto slc = _slice_vec.rbegin();
-        // iterate over dimensions to compute ith index
-        for(auto v = ret.begin(); v != ret.end(); ++v, ++slc) {
-            if(sz != _sizes.end()) {
-                auto idx = i / (*sz);
-                *v = (*slc)[idx];
-                i -= idx * (*sz);
-                ++sz;
-            } else {
-                *v = (*slc)[i];
-            }
-        }
-        return ret;
-    }
-
     template<typename C>
     NDSlice _convert(const C & conv) const
     {
diff --git a/src/include/ddptensor/PVSlice.hpp b/src/include/ddptensor/PVSlice.hpp
@@ -36,7 +36,18 @@ class BasePVSlice
     }
 
     uint64_t offset() const { return _offset; }
-    uint64_t tile_size() const { return _tile_size; }
+    ///
+    /// @return number of elements in the tile of the given rank (default: calling rank).
+    /// Every rank before the last one holds _tile_size elements; the last rank
+    /// holds whatever remains of the total VPROD(_shape).
+    ///
+    uint64_t tile_size(rank_type rank = theTransceiver->rank()) const
+    {
+        if(rank < theTransceiver->nranks() - 1) return _tile_size;
+        // `rank` full tiles precede this one
+        return VPROD(_shape) - (rank * _tile_size);
+    }
+    ///
+    /// @return shape of the tile of the given rank (default: calling rank).
+    /// Equals _shape except in the split dimension: every rank before the last
+    /// one gets offset() entries there, the last rank gets the remainder.
+    ///
+    shape_type tile_shape(rank_type rank = theTransceiver->rank()) const
+    {
+        shape_type r(_shape);
+        if(rank < theTransceiver->nranks() - 1) r[_split_dim] = offset();
+        // `rank` tiles of extent offset() precede this one
+        else r[_split_dim] -= rank * offset();
+        return r;
+    }
     int split_dim() const { return _split_dim; }
     const shape_type & shape() const { return _shape; }
     shape_type shape(rank_type rank) const
@@ -129,9 +140,14 @@ class PVSlice
         return _base->split_dim();
     }
 
-    const uint64_t tile_size() const
+    ///
+    /// @return true if shape() differs from base_shape()
+    ///
+    const bool is_sliced() const
+    {
+        return base_shape() != shape();
+    }
+
+    ///
+    /// @return tile-size of the given rank (default: calling rank) as reported by the base slice
+    ///
+    const uint64_t tile_size(rank_type rank = theTransceiver->rank()) const
     {
-        return _base->tile_size();
+        return _base->tile_size(rank);
     }
 
     const shape_type & shape() const
@@ -145,6 +161,11 @@ class PVSlice
     }
 
+    ///
+    /// @return tile-shape of the given rank (default: calling rank) as reported by the base slice
+    ///
     const shape_type tile_shape(rank_type rank = theTransceiver->rank()) const
+    {
+        return _base->tile_shape(rank);
+    }
+
+    ///
+    /// @return shape of slice_of_rank(rank) (default: calling rank)
+    ///
+    const shape_type shape_of_rank(rank_type rank = theTransceiver->rank()) const
     {
         return slice_of_rank(rank).shape();
     }
diff --git a/src/include/ddptensor/SetGetItem.hpp b/src/include/ddptensor/SetGetItem.hpp
@@ -10,6 +10,7 @@ struct GetItem
 {
     static tensor_i::ptr_type __getitem__(tensor_i::ptr_type a, const std::vector<py::slice> & v);
     static py::object get_slice(tensor_i::ptr_type a, const std::vector<py::slice> & v);
+    static py::object get_local(tensor_i::ptr_type a, py::handle h);
 };
 
 struct SetItem
diff --git a/src/include/ddptensor/x.hpp b/src/include/ddptensor/x.hpp
@@ -48,7 +48,6 @@ namespace x
         xt::xstrided_slice_vector _lslice;
         std::shared_ptr<xt::xarray<T>> _xarray;
         mutable T _replica = 0;
-        bool _issliced = false;
 
     public:
         using typed_ptr_type = std::shared_ptr<DPTensorX<T>>;
@@ -80,7 +79,7 @@ namespace x
         DPTensorX(const shape_type & shp, rank_type owner=NOOWNER)
             : _owner(owner),
               _slice(shp),
-              _xarray(std::make_shared<xt::xarray<T>>(xt::empty<T>(_slice.tile_shape())))
+              _xarray(std::make_shared<xt::xarray<T>>(xt::empty<T>(_slice.shape_of_rank())))
         {
         }
 
@@ -99,8 +98,7 @@ namespace x
             : _owner(owner),
               _slice(org._slice, slc),
               _lslice(to_xt(_slice.local_slice_of_rank())),
-              _xarray(org._xarray),
-              _issliced(true)
+              _xarray(org._xarray)
         {
             if(owner == NOOWNER && slice().size() <= 1) {
                 set_owner(org.slice().owner(slc));
@@ -114,15 +112,14 @@ namespace x
             : _owner(theTransceiver->rank()),
               _slice(std::forward<PVSlice>(slc)),
               _lslice(to_xt(_slice.slice())),
-              _xarray(),
-              _issliced(true)
+              _xarray()
         {
             _xarray = org;
         }
 
         bool is_sliced() const
         {
-            return _issliced;
+            // answered by the PVSlice member instead of a separately stored flag
+            return _slice.is_sliced();
         }
         
         virtual std::string __repr__() const
@@ -247,7 +244,7 @@ namespace x
 
         virtual void bufferize(const NDSlice & slc, Buffer & buff) const
         {
-            NDSlice lslice = NDSlice(slice().tile_shape()).slice(slc);
+            NDSlice lslice = NDSlice(slice().shape_of_rank()).slice(slc);
 
             std::cerr << "lslice=" << lslice << " slc= " << slc << " buffsz=" << buff.size() << " want " << slc.size()*sizeof(T) << std::endl;
 
diff --git a/test/test_spmd.py b/test/test_spmd.py

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ namespace x {`
`14`	`14`	`{`
`15`	`15`	`if constexpr (std::is_floating_point<T>::value) {`
`16`	`16`	`PVSlice pvslice(shp);`
`17`		`- shape_type shape(std::move(pvslice.tile_shape()));`
	`17`	`+ shape_type shape(std::move(pvslice.shape_of_rank()));`
`18`	`18`	`return operatorx<T>::mk_tx(std::move(pvslice), std::move(xt::random::rand(std::move(shape), to_native<T>(lower), to_native<T>(upper))));`
`19`	`19`	`}`
`20`	`20`	`}`
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`///`
`10`	`10`	`typedef std::vector<int64_t> NDIndex;`
`11`	`11`
	`12`	`+#if 0`
`12`	`13`	`///`
`13`	`14`	`/// @return tile-sizes for each dimension, as if leading dimensions were cut.`
`14`	`15`	`/// @param tile_shape tile-shape in question`
`@@ -42,3 +43,4 @@ uint64_t linearize(const std::vector<T> & idx, const std::vector<uint64_t> & tss`
`42`	`43`	`}`
`43`	`44`	`return tidx;`
`44`	`45`	`}`
	`46`	`+#endif`