IntelPython
diff --git a/‎ddptensor/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎ddptensor/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎ddptensor/spmd.py‎
Lines changed: 2 additions & 2 deletions b/‎ddptensor/spmd.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/Deferred.cpp‎
Lines changed: 27 additions & 21 deletions b/‎src/Deferred.cpp‎
Lines changed: 27 additions & 21 deletions
diff --git a/‎src/IO.cpp‎
Lines changed: 44 additions & 0 deletions b/‎src/IO.cpp‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎src/MPIMediator.cpp‎
Lines changed: 6 additions & 5 deletions b/‎src/MPIMediator.cpp‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/MPITransceiver.cpp‎
Lines changed: 20 additions & 7 deletions b/‎src/MPITransceiver.cpp‎
Lines changed: 20 additions & 7 deletions
diff --git a/‎src/Random.cpp‎
Lines changed: 1 addition & 1 deletion b/‎src/Random.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/SetGetItem.cpp‎
Lines changed: 28 additions & 13 deletions b/‎src/SetGetItem.cpp‎
Lines changed: 28 additions & 13 deletions
diff --git a/‎src/ddptensor.cpp‎
Lines changed: 7 additions & 2 deletions b/‎src/ddptensor.cpp‎
Lines changed: 7 additions & 2 deletions
@@ -43,6 +43,9 @@ def init(cw=None):
     cw = _ddpt_cw if cw is None else cw
     _init(cw)
 
+def to_numpy(a):  # thin module-level wrapper added by this patch
+    return _cdt.to_numpy(a._t)  # passes the wrapped `_t` member to `_cdt.to_numpy` and returns its result unchanged
+
 for op in api.api_categories["EWBinOp"]:
     if not op.startswith("__"):
         OP = op.upper()
 
@@ -6,5 +6,5 @@ def get_slice(obj, *args):
 def get_local(obj):
     return  _cdt._get_local(obj._t, obj)
 
-def gather(obj):
-    return  _cdt._gather(obj._t)
+def gather(obj, root=_cdt._Ranks._REPLICATED):  # new optional `root`; existing one-argument callers get _REPLICATED
+    return  _cdt._gather(obj._t, root)  # `root` is handed through unchanged to `_cdt._gather`
@@ -4,51 +4,57 @@
 #include "include/ddptensor/Mediator.hpp"
 #include "include/ddptensor/Registry.hpp"
 
-static tbb::concurrent_bounded_queue<Deferred::ptr_type> _deferred;
+static tbb::concurrent_bounded_queue<Runable::ptr_type> _deferred;
 
-Deferred::future_type Deferred::get_future()
+void push_runable(Runable::ptr_type && r)  // append `r` to the file-local `_deferred` queue
 {
-    return {std::move(tensor_i::promise_type::get_future()), _guid};
+    _deferred.push(std::move(r));  // `r` is moved into the queue; no registry or mediator interaction here
 }
 
-#if 0
-void Deferred::set_value(tensor_i::ptr_type && v)
+void _dist(const Runable * p)  // conditionally hand `p` to theMediator->to_workers
 {
-    if(_guid != Registry::NOGUID) {
-        Registry::put(_guid, v);
-    }
-    tensor_i::promise_type::set_value(std::forward<tensor_i::ptr_type>(v));
+    if(is_cw() && theTransceiver->rank() == 0)  // only when is_cw() holds and this process is rank 0
+        theMediator->to_workers(p);  // in every other case the call is a no-op
 }
-#endif
 
-Deferred::future_type Deferred::defer(Deferred::ptr_type && d, bool is_global)
+Deferred::future_type Deferred::get_future()  // build the future_type value for this Deferred
 {
+    return {std::move(tensor_i::promise_type::get_future().share()), _guid};  // shared future paired with `_guid`; NOTE(review): std::move on the temporary returned by share() looks redundant
+}
+
+Deferred::future_type defer_tensor(Runable::ptr_type && _d, bool is_global)  // register `_d`'s future, queue `_d`, return the future
+{
+    Deferred * d = dynamic_cast<Deferred*>(_d.get());  // typed view of the same object; `_d` still holds it
+    if(!d) throw std::runtime_error("Expected Deferred Tensor promise");  // hit for a null `_d` as well as for a Runable that is not a Deferred
     if(is_global) {
-        if(is_cw() && theTransceiver->rank() == 0) theMediator->to_workers(d);
-        if(d) d->_guid = Registry::get_guid();
+        _dist(d);  // forwarded before the guid is assigned, same order as the removed lines
+        d->_guid = Registry::get_guid();  // no null check needed any more: `d` was validated above
     }
-    auto f = d ? d->get_future() : Deferred::future_type();
+    auto f = d->get_future();  // obtained before `_d` is moved into the queue below
     Registry::put(f);
-    _deferred.push(std::move(d));
+    push_runable(std::move(_d));  // `d` must not be used by this function after this point
     return f;
 }
 
-Deferred::ptr_type Deferred::undefer_next()
+void Deferred::defer(Runable::ptr_type && p)  // queue `p` through defer_tensor
+{
+    defer_tensor(std::move(p), true);  // always is_global = true; the returned future is discarded
+}
+
+void Runable::defer(Runable::ptr_type && p)  // queue-only variant: no _dist call, no guid, no Registry::put
 {
-    Deferred::ptr_type r;
-    _deferred.pop(r);
-    return r;
+    push_runable(std::move(p));  // `p` goes straight onto `_deferred`
 }
 
-void Deferred::fini()
+void Runable::fini()  // renamed from Deferred::fini; body unchanged: clears the `_deferred` queue
 {
     _deferred.clear();
 }
 
 void process_promises()
 {
     while(true) {
-        Deferred::ptr_type d;
+        Runable::ptr_type d;
         _deferred.pop(d);
         if(d) d->run();
         else break;
 
@@ -0,0 +1,44 @@
+#include "ddptensor/IO.hpp"
+#include "ddptensor/SetGetItem.hpp"
+#include "ddptensor/TypeDispatch.hpp"
+#include "ddptensor/Factory.hpp"
+
+using promise_type = std::promise<py::object>;
+using future_type = std::shared_future<py::object>;
+
+struct DeferredToNumpy : public DeferredT<promise_type, future_type>  // deferred task whose promise/future carry a py::object (aliases declared just above)
+{
+    id_type _a;  // id of the input tensor's future, taken from a.id() in the constructor
+
+    DeferredToNumpy() = default;  // leaves `_a` unset; presumably filled via serialize() when recreated by the factory — confirm
+    DeferredToNumpy(const tensor_i::future_type & a)
+        : _a(a.id())
+    {}
+
+    void run()
+    {
+        const auto a = std::move(Registry::get(_a).get());  // look the future up by id and take its tensor
+        set_value(GetItem::do_gather(a, is_cw() ? 0 : REPLICATED));  // root is rank 0 under is_cw(), REPLICATED otherwise
+    }
+
+    FactoryId factory() const
+    {
+        return F_TONUMPY;  // same id that FACTORY_INIT registers for this type at the end of the file
+    }
+
+    template<typename S>
+    void serialize(S & ser)
+    {
+        ser.template value<sizeof(_a)>(_a);  // `_a` is the only serialized field
+    }
+};
+
+py::object IO::to_numpy(const ddptensor & a)  // queue a DeferredToNumpy for `a` and return its py::object result
+{
+    assert(!is_cw() || theTransceiver->rank() == 0);  // under is_cw() only rank 0 is expected to call this
+    auto f = defer<DeferredToNumpy>(a.get());  // `defer<>` is declared outside this diff
+    auto x = f.get();  // NOTE(review): presumably blocks until the queued task has run — confirm
+    return x;
+}
+
+FACTORY_INIT(DeferredToNumpy, F_TONUMPY);
@@ -19,7 +19,7 @@ constexpr static int DEFER_TAG = 14714;
 constexpr static int EXIT_TAG = 14715;
 static std::mutex ak_mutex;
 
-void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm);
+void send_to_workers(const Runable * dfrd, bool self, MPI_Comm comm);
 
 MPIMediator::MPIMediator()
     : _listener(nullptr)
@@ -82,7 +82,7 @@ void MPIMediator::pull(rank_type from, id_type guid, const NDSlice & slice, void
     if(cnt != sz) throw(std::runtime_error("Received unexpected message size."));
 }
 
-void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm)
+void send_to_workers(const Runable * dfrd, bool self, MPI_Comm comm)
 {
     int rank, sz;
     MPI_Comm_rank(comm, &rank);
@@ -124,7 +124,7 @@ void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm)
     }
 }
 
-void MPIMediator::to_workers(const Deferred::ptr_type & dfrd)
+void MPIMediator::to_workers(const Runable * dfrd)  // parameter changed from a Deferred::ptr_type reference to a raw const Runable pointer
 {
     send_to_workers(dfrd, false, _comm);
 }
@@ -158,7 +158,8 @@ void MPIMediator::listen()
         case DEFER_TAG: {
             FactoryId fctryid;
             ser.value<sizeof(fctryid)>(fctryid);
-            Deferred::defer(Factory::get(fctryid)->create(ser), true);
+            auto uptr = Factory::get(fctryid)->create(ser);
+            uptr.get()->defer(std::move(uptr)); // grmpf
             break;
         }
         case PULL_TAG: {
@@ -181,7 +182,7 @@ void MPIMediator::listen()
             break;
         }
         case EXIT_TAG:
-            Deferred::defer(nullptr, false);
+            defer(nullptr);
             return;
         default:
             throw(std::runtime_error("Received unexpected message tag."));
 
@@ -180,14 +180,27 @@ void MPITransceiver::alltoall(const void* buffer_send,
                   _comm);
 }
 
-void MPITransceiver::allgather(void* buffer,
-                               const int* counts,
-                               const int* displacements,
-                               DTypeId datatype)
+void MPITransceiver::gather(void* buffer,  // replaces allgather; same in-place buffer convention plus a `root`
+                            const int* counts,  // passed to MPI as the per-rank receive counts
+                            const int* displacements,  // passed to MPI as the per-rank receive displacements
+                            DTypeId datatype,
+                            rank_type root)  // REPLICATED selects the all-gather path, anything else is the receiving rank
 {
-    MPI_Allgatherv(MPI_IN_PLACE, 0, to_mpi(datatype),
-                   buffer, counts, displacements, to_mpi(datatype),
-                   _comm);
+    if(root == REPLICATED) {  // NOTE(review): none of the MPI return codes below are checked
+        MPI_Allgatherv(MPI_IN_PLACE, 0, to_mpi(datatype),
+                       buffer, counts, displacements, to_mpi(datatype),
+                       _comm);
+    } else {
+        if(root == _rank) {  // receiving rank: in place, own contribution is already inside `buffer`
+            MPI_Gatherv(MPI_IN_PLACE, 0, to_mpi(datatype),
+                        buffer, counts, displacements, to_mpi(datatype),
+                        root, _comm);
+        } else {  // sending rank: `buffer` is read from its start, counts[_rank] elements
+            MPI_Gatherv(buffer, counts[_rank], to_mpi(datatype),
+                        nullptr, nullptr, nullptr, to_mpi(datatype),
+                        root, _comm);
+        }
+    }
 }
 
 void MPITransceiver::send_recv(void* buffer_send,
 
@@ -69,7 +69,7 @@ ddptensor * Random::rand(DTypeId dtype, const shape_type & shape, const py::obje
 
 void Random::seed(uint64_t s)
 {
-    defer([s](){xt::random::seed(s); return tensor_i::ptr_type();});
+    defer_lambda([s](){xt::random::seed(s); return tensor_i::ptr_type();});  // only the callee name changed; the lambda copies `s`, seeds xt::random and returns a default-constructed tensor_i::ptr_type
 }
 
 FACTORY_INIT(DeferredRandomOp, F_RANDOM);
@@ -77,7 +77,7 @@ namespace x {
             PVSlice g_slc_view(a_ptr->slice(), slice);
             PVSlice my_rel_slice(g_slc_view, theTransceiver->rank());
             NDSlice my_norm_slice = g_slc_view.map_slice(my_rel_slice.slice_of_rank()); //slice());my_slice);
-            
+
             if(is_spmd()) theTransceiver->barrier();
             _set_slice<A>(a_ptr->xarray(), my_rel_slice, b_ptr, my_norm_slice, val_guid);
             theTransceiver->barrier();
@@ -129,37 +129,47 @@ namespace x {
         // We simply create a local buffer, copy our local data to the right place
         // and then call AllGatherV via inplace operation.
         template<typename T>
-        static py::object op(const std::shared_ptr<DPTensorX<T>> & a_ptr)
+        static py::object op(rank_type root, const std::shared_ptr<DPTensorX<T>> & a_ptr)  // `root` is new: REPLICATED or the single rank that receives the gathered array
         {
             auto nranks = theTransceiver->nranks();
             auto rank = theTransceiver->rank();
+            bool sendonly = root != REPLICATED && root != rank;  // true on ranks that only contribute data; they return a default-constructed py::array
             const auto & slc = a_ptr->slice();
+            auto mysz = slc.slice_of_rank().size();  // used below as this rank's own element count (the i == rank case)
 
             // create buffer/numpy array
-            auto res = py::array_t<T>(std::move(slc.shape()));
-            T * ptr = reinterpret_cast<T*>(res.mutable_data());
+            T * ptr = nullptr;
+            py::array res;
+            if(sendonly) {
+                if(mysz > 0 && a_ptr->is_sliced()) ptr = new T[mysz];  // scratch send buffer, only for the element-by-element copy path; released after the gather
+            } else {
+                res = py::array_t<T>(std::move(slc.shape()));  // receiving ranks allocate the full result
+                ptr = reinterpret_cast<T*>(res.mutable_data());
+            }
             int displacements[nranks];
             int counts[nranks];
             int off = 0;
             // for each rank compute counts and displacements
             for(auto i=0; i<nranks; ++i) {
-                uint64_t szi = slc.slice_of_rank(i).size();
+                uint64_t szi = i == rank ? mysz : slc.slice_of_rank(i).size();  // reuse the size already computed for this rank
                 counts[i] = szi;
                 displacements[i] = off;
                 // copy our local data
                 if(i == rank) {
                     if(a_ptr->is_sliced()) {
                         // if non-contiguous copy element by element
                         const auto & av = xt::strided_view(a_ptr->xarray(), a_ptr->lslice());
-                        uint64_t i = off-1;
-                        for(auto v : av) ptr[++i] = v;
+                        uint64_t j = sendonly ? -1 : off - 1;  // send-only buffers start at index 0: -1 wraps so the first ++j yields 0
+                        for(auto v : av) ptr[++j] = v;  // index renamed from `i`, which shadowed the loop variable
                     } else {
-                        memcpy(&ptr[off], a_ptr->xarray().data(), szi*sizeof(T));
+                        if(sendonly && mysz > 0) ptr = a_ptr->xarray().data();  // contiguous send-only case: send directly from the tensor's storage, no copy
+                        else memcpy(&ptr[off], a_ptr->xarray().data(), szi*sizeof(T));  // NOTE(review): a send-only rank with mysz == 0 reaches this with ptr == nullptr and a zero length — confirm that is acceptable
                     }
                 }
                 off += szi;
             }
-            theTransceiver->allgather(ptr, counts, displacements, DTYPE<T>::value);
+            theTransceiver->gather(ptr, counts, displacements, DTYPE<T>::value, root);
+            if(sendonly && mysz > 0 && a_ptr->is_sliced()) delete [] ptr;  // same condition as the `new T[mysz]` above; NOTE(review): not released if anything in between throws
             return res;
         }
     };
@@ -171,12 +181,12 @@ struct DeferredSetItem : public Deferred
     id_type _a;
     id_type _b;
     NDSlice _slc;
-    
+
     DeferredSetItem() = default;
     DeferredSetItem(const tensor_i::future_type & a, const tensor_i::future_type & b, const std::vector<py::slice> & v)
         : _a(a.id()), _b(b.id()), _slc(v)
     {}
-    
+
     void run()
     {
         const auto a = std::move(Registry::get(_a).get());
@@ -249,10 +259,15 @@ py::object GetItem::get_local(const ddptensor & a, py::handle h)
     return TypeDispatch<x::SPMD>(aa, h);
 }
 
-py::object GetItem::gather(const ddptensor & a)
+py::object GetItem::do_gather(const tensor_i::ptr_type & a, rank_type root)  // dispatch for an already-resolved tensor pointer; also called from DeferredToNumpy::run
+{
+    return TypeDispatch<x::SPMD>(a, root);  // `root` is forwarded as an extra argument to the dispatch
+}
+
+py::object GetItem::gather(const ddptensor & a, rank_type root)  // resolves `a` to its tensor, then delegates to do_gather
 {
     const auto aa = std::move(a.get().get());
-    return TypeDispatch<x::SPMD>(aa);
+    return do_gather(aa, root);
 }
 
 FACTORY_INIT(DeferredGetItem, F_GETITEM);
 
@@ -34,6 +34,7 @@ using namespace pybind11::literals; // to bring _a
 #include "ddptensor/LinAlgOp.hpp"
 #include "ddptensor/Service.hpp"
 #include "ddptensor/Factory.hpp"
+#include "ddptensor/IO.hpp"
 
 // #########################################################################
 // The following classes are wrappers bridging pybind11 defs to TypeDispatch
@@ -68,7 +69,7 @@ void fini()
     delete theMediator;  // stop task is sent in here
     theMediator = nullptr;
     if(pprocessor) {
-        if(theTransceiver->nranks() == 1) Deferred::defer(nullptr, false);
+        if(theTransceiver->nranks() == 1) defer(nullptr);
         pprocessor->join();
         delete pprocessor;
     }
@@ -115,18 +116,22 @@ PYBIND11_MODULE(_ddptensor, m) {
     Factory::init<F_SETITEM>();
     Factory::init<F_RANDOM>();
     Factory::init<F_SERVICE>();
+    Factory::init<F_TONUMPY>();
 
     m.doc() = "A partitioned and distributed tensor";
 
     def_enums(m);
+    py::enum_<_RANKS>(m, "_Ranks")
+        .value("_REPLICATED", REPLICATED);
 
     m.def("fini", &fini)
         .def("init", &init)
         .def("sync", &sync)
         .def("myrank", &myrank)
         .def("_get_slice", &GetItem::get_slice)
         .def("_get_local", &GetItem::get_local)
-        .def("_gather", &GetItem::gather);
+        .def("_gather", &GetItem::gather)
+        .def("to_numpy", &IO::to_numpy);
 
     py::class_<Creator>(m, "Creator")
         .def("create_from_shape", &Creator::create_from_shape)
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ constexpr static int DEFER_TAG = 14714;`
`19`	`19`	`constexpr static int EXIT_TAG = 14715;`
`20`	`20`	`static std::mutex ak_mutex;`
`21`	`21`
`22`		`-void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm);`
	`22`	`+void send_to_workers(const Runable * dfrd, bool self, MPI_Comm comm);`
`23`	`23`
`24`	`24`	`MPIMediator::MPIMediator()`
`25`	`25`	`: _listener(nullptr)`
`@@ -82,7 +82,7 @@ void MPIMediator::pull(rank_type from, id_type guid, const NDSlice & slice, void`
`82`	`82`	`if(cnt != sz) throw(std::runtime_error("Received unexpected message size."));`
`83`	`83`	`}`
`84`	`84`
`85`		`-void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm)`
	`85`	`+void send_to_workers(const Runable * dfrd, bool self, MPI_Comm comm)`
`86`	`86`	`{`
`87`	`87`	`int rank, sz;`
`88`	`88`	`MPI_Comm_rank(comm, &rank);`
`@@ -124,7 +124,7 @@ void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm)`
`124`	`124`	`}`
`125`	`125`	`}`
`126`	`126`
`127`		`-void MPIMediator::to_workers(const Deferred::ptr_type & dfrd)`
	`127`	`+void MPIMediator::to_workers(const Runable * dfrd)`
`128`	`128`	`{`
`129`	`129`	`send_to_workers(dfrd, false, _comm);`
`130`	`130`	`}`
`@@ -158,7 +158,8 @@ void MPIMediator::listen()`
`158`	`158`	`case DEFER_TAG: {`
`159`	`159`	`FactoryId fctryid;`
`160`	`160`	`ser.value<sizeof(fctryid)>(fctryid);`
`161`		`- Deferred::defer(Factory::get(fctryid)->create(ser), true);`
	`161`	`+ auto uptr = Factory::get(fctryid)->create(ser);`
	`162`	`+ uptr.get()->defer(std::move(uptr)); // grmpf`
`162`	`163`	`break;`
`163`	`164`	`}`
`164`	`165`	`case PULL_TAG: {`
`@@ -181,7 +182,7 @@ void MPIMediator::listen()`
`181`	`182`	`break;`
`182`	`183`	`}`
`183`	`184`	`case EXIT_TAG:`
`184`		`- Deferred::defer(nullptr, false);`
	`185`	`+ defer(nullptr);`
`185`	`186`	`return;`
`186`	`187`	`default:`
`187`	`188`	`throw(std::runtime_error("Received unexpected message tag."));`
Original file line number	Diff line number	Diff line change
`@@ -69,7 +69,7 @@ ddptensor * Random::rand(DTypeId dtype, const shape_type & shape, const py::obje`
`69`	`69`
`70`	`70`	`void Random::seed(uint64_t s)`
`71`	`71`	`{`
`72`		`- defer([s](){xt::random::seed(s); return tensor_i::ptr_type();});`
	`72`	`+ defer_lambda([s](){xt::random::seed(s); return tensor_i::ptr_type();});`
`73`	`73`	`}`
`74`	`74`
`75`	`75`	`FACTORY_INIT(DeferredRandomOp, F_RANDOM);`