
Commit ecc2cc7
adding spmd.gather
1 parent 9abdeab

12 files changed: +202 -37 lines

ddptensor/spmd.py
Lines changed: 3 additions & 0 deletions

@@ -5,3 +5,6 @@ def get_slice(obj, *args):
 
 def get_local(obj):
     return _cdt._get_local(obj._t, obj)
+
+def gather(obj):
+    return _cdt._gather(obj._t)

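The new gather complements get_local: get_local returns only the calling rank's partition, while gather hands every rank a copy of the complete tensor. A minimal usage sketch (not part of the commit; dt.arange and its signature are assumptions about ddptensor's creation API):

import ddptensor as dt
from ddptensor import spmd

t = dt.arange(0, 16, 1, dt.int64)  # tensor distributed across all ranks (hypothetical creator)
loc = spmd.get_local(t)            # this rank's local partition only
full = spmd.gather(t)              # complete tensor, replicated on every rank
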
setup.py
Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def build_cmake(self, ext):
         extdir.parent.mkdir(parents=True, exist_ok=True)
 
         # example of cmake args
-        config = 'Debug' if self.debug else 'Release' # 'RelWithDebInfo' #'Release'
+        config = 'Debug'# if self.debug else 'Release' # 'RelWithDebInfo' #'Release'
         cmake_args = [
             '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + str(extdir.parent.absolute()),
             '-DCMAKE_BUILD_TYPE=' + config

src/MPIMediator.cpp
Lines changed: 19 additions & 20 deletions

@@ -8,6 +8,7 @@
 
 #include "ddptensor/UtilsAndTypes.hpp"
 #include "ddptensor/MPIMediator.hpp"
+#include "ddptensor/MPITransceiver.hpp"
 #include "ddptensor/NDSlice.hpp"
 #include "ddptensor/Factory.hpp"
 
@@ -18,29 +19,30 @@ constexpr static int DEFER_TAG = 14714;
 constexpr static int EXIT_TAG = 14715;
 static std::mutex ak_mutex;
 
-void send_to_workers(const Deferred::ptr_type & dfrd, bool self = false);
+void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm);
 
 MPIMediator::MPIMediator()
     : _listener(nullptr)
 {
-    MPI_Comm comm = MPI_COMM_WORLD;
+    auto c = dynamic_cast<MPITransceiver*>(theTransceiver);
+    if(c == nullptr) throw std::runtime_error("Expected Transceiver to be MPITransceiver.");
+    _comm = c->comm();
     int sz;
-    MPI_Comm_size(comm, &sz);
+    MPI_Comm_size(_comm, &sz);
     if(sz > 1)
         _listener = new std::thread(&MPIMediator::listen, this);
 }
 
 MPIMediator::~MPIMediator()
 {
     std::cerr << "MPIMediator::~MPIMediator()" << std::endl;
-    MPI_Comm comm = MPI_COMM_WORLD;
     int rank, sz;
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &sz);
+    MPI_Comm_rank(_comm, &rank);
+    MPI_Comm_size(_comm, &sz);
 
     if(is_cw() && rank == 0) to_workers(nullptr);
-    MPI_Barrier(comm);
-    if(!is_cw() || rank == 0) send_to_workers(nullptr, true);
+    MPI_Barrier(_comm);
+    if(!is_cw() || rank == 0) send_to_workers(nullptr, true, _comm);
     if(_listener) {
         _listener->join();
         delete _listener;
@@ -50,7 +52,6 @@ MPIMediator::~MPIMediator()
 
 void MPIMediator::pull(rank_type from, id_type guid, const NDSlice & slice, void * rbuff)
 {
-    MPI_Comm comm = MPI_COMM_WORLD;
     MPI_Request request[2];
     MPI_Status status[2];
     Buffer buff;
@@ -65,8 +66,8 @@ void MPIMediator::pull(rank_type from, id_type guid, const NDSlice & slice, void
     int cnt = static_cast<int>(ser.adapter().writtenBytesCount());
 
     auto sz = slice.size() * Registry::get(id).get()->item_size();
-    MPI_Irecv(rbuff, sz, MPI_CHAR, from, PUSH_TAG, comm, &request[1]);
-    MPI_Isend(buff.data(), cnt, MPI_CHAR, from, REQ_TAG, comm, &request[0]);
+    MPI_Irecv(rbuff, sz, MPI_CHAR, from, PUSH_TAG, _comm, &request[1]);
+    MPI_Isend(buff.data(), cnt, MPI_CHAR, from, REQ_TAG, _comm, &request[0]);
     auto error_code = MPI_Waitall(2, &request[0], &status[0]);
     if (error_code != MPI_SUCCESS) {
         throw std::runtime_error("MPI_Waitall returned error code " + std::to_string(error_code));
@@ -81,10 +82,9 @@ void MPIMediator::pull(rank_type from, id_type guid, const NDSlice & slice, void
     if(cnt != sz) throw(std::runtime_error("Received unexpected message size."));
 }
 
-void send_to_workers(const Deferred::ptr_type & dfrd, bool self)
+void send_to_workers(const Deferred::ptr_type & dfrd, bool self, MPI_Comm comm)
 {
     int rank, sz;
-    MPI_Comm comm = MPI_COMM_WORLD;
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &sz);
 
@@ -126,22 +126,21 @@ void send_to_workers(const Deferred::ptr_type & dfrd, bool self)
 
 void MPIMediator::to_workers(const Deferred::ptr_type & dfrd)
 {
-    send_to_workers(dfrd);
+    send_to_workers(dfrd, false, _comm);
 }
 
 void MPIMediator::listen()
 {
     int nranks;
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+    MPI_Comm_size(_comm, &nranks);
     if(nranks < 2 ) return;
 
     constexpr int BSZ = 256;
-    MPI_Comm comm = MPI_COMM_WORLD;
     MPI_Request request_in = MPI_REQUEST_NULL, request_out = MPI_REQUEST_NULL;
     Buffer rbuff;
     // Issue async recv request
     Buffer buff(BSZ);
-    MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, REQ_TAG, comm, &request_in);
+    MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, REQ_TAG, _comm, &request_in);
     do {
         MPI_Status status;
         // Wait for any request
@@ -170,15 +169,15 @@ void MPIMediator::listen()
 
             // Issue async recv request for next msg
            buff.resize(BSZ);
-           MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, REQ_TAG, comm, &request_in);
+           MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, REQ_TAG, _comm, &request_in);
 
            // Now find the array in question and send back its bufferized slice
            tensor_i::ptr_type ptr = Registry::get(id).get();
            // Wait for previous answer to complete so that we can re-use the buffer
            MPI_Wait(&request_out, MPI_STATUS_IGNORE);
            ptr->bufferize(slice, rbuff);
            if(slice.size() * ptr->item_size() != rbuff.size()) throw(std::runtime_error("Got unexpected buffer size."));
-           MPI_Isend(rbuff.data(), rbuff.size(), MPI_CHAR, requester, PUSH_TAG, comm, &request_out);
+           MPI_Isend(rbuff.data(), rbuff.size(), MPI_CHAR, requester, PUSH_TAG, _comm, &request_out);
            break;
        }
        case EXIT_TAG:
@@ -190,7 +189,7 @@ void MPIMediator::listen()
         if(request_in == MPI_REQUEST_NULL) {
             // Issue async recv request for next msg
             buff.resize(BSZ);
-            MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, REQ_TAG, comm, &request_in);
+            MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, REQ_TAG, _comm, &request_in);
         }
     } while(true);
     // MPI_Cancel(&request_in);
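
The refactor above replaces every hard-coded MPI_COMM_WORLD with the transceiver's communicator: once workers are added via MPI_Comm_spawn, the parent's MPI_COMM_WORLD no longer contains them, so mediator traffic must run over the merged intra-communicator. A minimal mpi4py sketch of that spawn-and-merge pattern (illustration only, not ddptensor code; the script re-executes itself as the worker):

import sys
from mpi4py import MPI

parent = MPI.Comm.Get_parent()
if parent == MPI.COMM_NULL:
    # original process: spawn two workers running this same script
    inter = MPI.COMM_WORLD.Spawn(sys.executable, args=[__file__], maxprocs=2)
    comm = inter.Merge(high=False)   # parent ranks ordered first
else:
    # spawned worker: merge towards the parent
    comm = parent.Merge(high=True)

# comm spans parent and workers; MPI.COMM_WORLD alone would not
print(comm.Get_rank(), "of", comm.Get_size())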

src/MPITransceiver.cpp
Lines changed: 91 additions & 7 deletions

@@ -2,9 +2,11 @@
 
 #include <mpi.h>
 #include <limits>
+#include <sstream>
 #include "ddptensor/MPITransceiver.hpp"
 
 MPITransceiver::MPITransceiver()
+    : _nranks(1), _rank(0), _comm(MPI_COMM_WORLD)
 {
     int flag;
     MPI_Initialized(&flag);
@@ -21,9 +23,81 @@ MPITransceiver::MPITransceiver()
         throw(std::logic_error("MPI had been initialized incorrectly: not MPI_THREAD_MULTIPLE"));
         std::cerr << "MPI already initialized\n";
     }
+
     int nranks, rank;
-    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_rank(_comm, &rank);
+    MPI_Comm parentComm;
+    MPI_Comm_get_parent(&parentComm);
+
+    // rank father-of-all checks if he's requested to spawn processes:
+    if(rank == 0 && parentComm == MPI_COMM_NULL) {
+        // Ok, let's spawn the clients.
+        // I need some information for the startup.
+        // 1. Name of the executable (default is the current exe)
+        const char * _tmp = getenv("DDPT_MPI_SPAWN");
+        if(_tmp) {
+            int nClientsToSpawn = atol(_tmp);
+            _tmp = getenv("DDPT_MPI_EXECUTABLE");
+            std::string clientExe(_tmp ? _tmp : getenv("PYTHON_EXE"));
+            if(clientExe.empty()) throw std::runtime_error("Spawning MPI processes requires setting 'DDPT_MPI_EXECUTABLE' or 'PYTHON_EXE'");
+
+            // 2. arguments
+            _tmp = getenv("DDPT_MPI_EXE_ARGS");
+            std::vector<std::string> args;
+            if(_tmp) {
+                std::istringstream iss(_tmp);
+                std::copy(std::istream_iterator<std::string>(iss),
+                          std::istream_iterator<std::string>(),
+                          std::back_inserter(args));
+            } else {
+                _tmp = "-c import ddptensor as dt; dt.init(True)";
+                args.push_back("-c");
+                args.push_back("import ddptensor as dt; dt.init(True)");
+            }
+            const char * clientArgs[args.size()+1];
+            for(int i=0; i<args.size(); ++i) clientArgs[i] = args[i].c_str();
+            clientArgs[args.size()] = nullptr;
+
+            // 3. Special setting for MPI_Info: hosts
+            const char * clientHost = getenv("DDPT_MPI_HOSTS");
+
+            // Prepare MPI_Info object:
+            MPI_Info clientInfo = MPI_INFO_NULL;
+            if(clientHost) {
+                MPI_Info_create(&clientInfo);
+                MPI_Info_set(clientInfo, const_cast< char * >("host"), const_cast< char * >(clientHost));
+                std::cerr << "[DDPT " << rank << "] Set MPI_Info_set(\"host\", \"" << clientHost << "\")\n";
+            }
+            // Now spawn the client processes:
+            // can't use Speaker yet, need Channels to be inited
+            std::cerr << "[DDPT " << rank << "] Spawning " << nClientsToSpawn << " MPI processes ("
+                      << clientExe << " " << _tmp << ")" << std::endl;
+            int* errCodes = new int[nClientsToSpawn];
+            MPI_Comm interComm;
+            int err = MPI_Comm_spawn(const_cast< char * >(clientExe.c_str()),
+                                     const_cast< char ** >(clientArgs),
+                                     nClientsToSpawn, clientInfo, 0,
+                                     MPI_COMM_WORLD, &interComm, errCodes);
+            delete [] errCodes;
+            if (err) {
+                // can't use Speaker yet, need Channels to be inited
+                std::cerr << "[DDPT " << rank << "] Error in MPI_Comm_spawn. Skipping process spawning";
+            } else {
+                MPI_Intercomm_merge(interComm, 0, &_comm);
+            }
+        } // else {
+          // No process spawning
+          // MPI-1 situation: all clients to be started by mpiexec
+          // _comm = MPI_COMM_WORLD;
+          //}
+    }
+    if(parentComm != MPI_COMM_NULL) {
+        // I am a child. Build intra-comm to the parent.
+        MPI_Intercomm_merge(parentComm, 1, &_comm);
+    }
+
+    MPI_Comm_size(_comm, &nranks);
+    MPI_Comm_rank(_comm, &rank);
     _nranks = nranks;
     _rank = rank;
 };
@@ -73,17 +147,17 @@ static MPI_Op to_mpi(RedOpType o)
 
 void MPITransceiver::barrier()
 {
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(_comm);
 }
 
 void MPITransceiver::bcast(void * ptr, size_t N, rank_type root)
 {
-    MPI_Bcast(ptr, N, MPI_CHAR, root, MPI_COMM_WORLD);
+    MPI_Bcast(ptr, N, MPI_CHAR, root, _comm);
 }
 
 void MPITransceiver::reduce_all(void * inout, DTypeId T, size_t N, RedOpType op)
 {
-    MPI_Allreduce(MPI_IN_PLACE, inout, N, to_mpi(T), to_mpi(op), MPI_COMM_WORLD);
+    MPI_Allreduce(MPI_IN_PLACE, inout, N, to_mpi(T), to_mpi(op), _comm);
 }
 
 void MPITransceiver::alltoall(const void* buffer_send,
@@ -103,7 +177,17 @@ void MPITransceiver::alltoall(const void* buffer_send,
                   counts_recv,
                   displacements_recv,
                   to_mpi(datatype_recv),
-                  MPI_COMM_WORLD);
+                  _comm);
 }
+
+void MPITransceiver::allgather(void* buffer,
+                               const int* counts,
+                               const int* displacements,
+                               DTypeId datatype)
+{
+    MPI_Allgatherv(MPI_IN_PLACE, 0, to_mpi(datatype),
+                   buffer, counts, displacements, to_mpi(datatype),
+                   _comm);
+}
 
 void MPITransceiver::send_recv(void* buffer_send,
@@ -120,6 +204,6 @@ void MPITransceiver::send_recv(void* buffer_send,
                  SRTAG,
                  source,
                  SRTAG,
-                 MPI_COMM_WORLD,
+                 _comm,
                  MPI_STATUS_IGNORE);
 }
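
The spawn path in the constructor is driven entirely by environment variables. A hypothetical controller-side launch sketch (the variable names DDPT_MPI_SPAWN, DDPT_MPI_EXECUTABLE, DDPT_MPI_EXE_ARGS and DDPT_MPI_HOSTS come from the diff above; the concrete values and the final init call are assumptions):

import os

# must be set before ddptensor initializes MPI
os.environ["DDPT_MPI_SPAWN"] = "3"                      # ask rank 0 to spawn 3 workers
os.environ["DDPT_MPI_EXECUTABLE"] = "/usr/bin/python3"  # worker executable
# workers default to running: -c "import ddptensor as dt; dt.init(True)"

import ddptensor as dt
dt.init(True)  # rank 0 spawns the workers and merges the inter-communicator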

src/SetGetItem.cpp
Lines changed: 47 additions & 4 deletions

@@ -14,11 +14,11 @@ namespace x {
         template<typename T>
         static ptr_type op(const NDSlice & slice, const std::shared_ptr<DPTensorX<T>> & a_ptr)
         {
-            auto nd = a_ptr->shape().size();
-            if(nd != slice.ndims())
+            const auto & slc = a_ptr->slice();
+            if(slc.ndims() != slice.ndims())
                 throw std::runtime_error("Index dimensionality must match array dimensionality");
 
-            return operatorx<T>::mk_tx(*a_ptr.get(), slice);
+            return operatorx<T>::mk_tx(*a_ptr.get(), slice.trim(slc.slice()));
         }
     };
 
@@ -50,7 +50,6 @@
             NDSlice my_curr_local_slice = my_curr_needed_view.local_slice_of_rank(theTransceiver->rank());
 
             if(curr_needed_norm_slice.size()) {
-                py::tuple tpl = _make_tuple(my_curr_local_slice); //my_curr_view.slice());
                 if(i == theTransceiver->rank()) {
                     // copy locally
                     auto to_v = xt::strided_view(dest/*.xarray()*/, to_xt(my_curr_local_slice));
@@ -125,6 +124,44 @@
             T * data = a_ptr->xarray().data();
             return py::array(std::move(slc.shape()), std::move(strides), data + off, handle);
         }
+
+        // gather
+        // We simply create a local buffer, copy our local data to the right place
+        // and then call AllGatherV via inplace operation.
+        template<typename T>
+        static py::object op(const std::shared_ptr<DPTensorX<T>> & a_ptr)
+        {
+            auto nranks = theTransceiver->nranks();
+            auto rank = theTransceiver->rank();
+            const auto & slc = a_ptr->slice();
+
+            // create buffer/numpy array
+            auto res = py::array_t<T>(std::move(slc.shape()));
+            T * ptr = reinterpret_cast<T*>(res.mutable_data());
+            int displacements[nranks];
+            int counts[nranks];
+            int off = 0;
+            // for each rank compute counts and displacements
+            for(auto i=0; i<nranks; ++i) {
+                uint64_t szi = slc.slice_of_rank(i).size();
+                counts[i] = szi;
+                displacements[i] = off;
+                // copy our local data
+                if(i == rank) {
+                    if(a_ptr->is_sliced()) {
+                        // if non-contiguous copy element by element
+                        const auto & av = xt::strided_view(a_ptr->xarray(), a_ptr->lslice());
+                        uint64_t i = off-1;
+                        for(auto v : av) ptr[++i] = v;
+                    } else {
+                        memcpy(&ptr[off], a_ptr->xarray().data(), szi*sizeof(T));
+                    }
+                }
+                off += szi;
+            }
+            theTransceiver->allgather(ptr, counts, displacements, DTYPE<T>::value);
+            return res;
+        }
     };
 
 } // namespace x
@@ -212,5 +249,11 @@ py::object GetItem::get_local(const ddptensor & a, py::handle h)
     return TypeDispatch<x::SPMD>(aa, h);
 }
 
+py::object GetItem::gather(const ddptensor & a)
+{
+    const auto aa = std::move(a.get().get());
+    return TypeDispatch<x::SPMD>(aa);
+}
+
 FACTORY_INIT(DeferredGetItem, F_GETITEM);
 FACTORY_INIT(DeferredSetItem, F_SETITEM);
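
The gather comments above describe the pattern: allocate the full-size result, copy the local partition to its displacement, then let an in-place allgather fill in the remote parts. A standalone mpi4py sketch of the same in-place MPI_Allgatherv pattern (illustration only; plain 1-D NumPy buffers and an even split are assumed, whereas ddptensor derives per-rank counts from slc.slice_of_rank(i)):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, nranks = comm.Get_rank(), comm.Get_size()

counts = [4] * nranks                    # elements owned by each rank
displs = [i * 4 for i in range(nranks)]  # offset of each rank's block in the result

buf = np.zeros(sum(counts), dtype=np.float64)
# each rank fills only its own region, like the memcpy branch above
buf[displs[rank]:displs[rank] + counts[rank]] = rank

# in-place all-gather: afterwards every rank holds the complete buffer
comm.Allgatherv(MPI.IN_PLACE, [buf, counts, displs, MPI.DOUBLE])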

src/ddptensor.cpp
Lines changed: 4 additions & 4 deletions

@@ -83,6 +83,8 @@ void fini()
 void init(bool cw)
 {
     if(inited) return;
+    theTransceiver = new MPITransceiver();
+    theMediator = new MPIMediator();
     if(cw) {
         _is_cw = true;
         if(theTransceiver->rank()) {
@@ -114,9 +116,6 @@ PYBIND11_MODULE(_ddptensor, m) {
     Factory::init<F_RANDOM>();
     Factory::init<F_SERVICE>();
 
-    theTransceiver = new MPITransceiver();
-    theMediator = new MPIMediator();
-
     m.doc() = "A partitioned and distributed tensor";
 
     def_enums(m);
@@ -126,7 +125,8 @@ PYBIND11_MODULE(_ddptensor, m) {
         .def("sync", &sync)
        .def("myrank", &myrank)
        .def("_get_slice", &GetItem::get_slice)
-       .def("_get_local", &GetItem::get_local);
+       .def("_get_local", &GetItem::get_local)
+       .def("_gather", &GetItem::gather);
 
     py::class_<Creator>(m, "Creator")
         .def("create_from_shape", &Creator::create_from_shape)
