introducing idtr::rebalance; fixing calls to imex::ptensor::create

fschlimb · fschlimb · commit 08a3baf6f3e6 · 2023-01-16T06:54:45.000-06:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -102,23 +102,26 @@ set(DDPTSrcs
     ${PROJECT_SOURCE_DIR}/src/SetGetItem.cpp
     ${PROJECT_SOURCE_DIR}/src/Sorting.cpp
 )
-set(IDTRSrcs
-    ${PROJECT_SOURCE_DIR}/src/idtr.cpp
+set(RTSrcs
     ${PROJECT_SOURCE_DIR}/src/CollComm.cpp
     ${PROJECT_SOURCE_DIR}/src/DDPTensorImpl.cpp
     ${PROJECT_SOURCE_DIR}/src/Deferred.cpp
     ${PROJECT_SOURCE_DIR}/src/Factory.cpp
     ${PROJECT_SOURCE_DIR}/src/Mediator.cpp
     ${PROJECT_SOURCE_DIR}/src/MPIMediator.cpp
-    ${PROJECT_SOURCE_DIR}/src/MPITransceiver.cpp
     ${PROJECT_SOURCE_DIR}/src/Registry.cpp
-    ${PROJECT_SOURCE_DIR}/src/Transceiver.cpp
     ${PROJECT_SOURCE_DIR}/src/jit/mlir.cpp
 )
+set(IDTRSrcs
+    ${PROJECT_SOURCE_DIR}/src/idtr.cpp
+    ${PROJECT_SOURCE_DIR}/src/MPITransceiver.cpp
+    ${PROJECT_SOURCE_DIR}/src/Transceiver.cpp
+)
 
 pybind11_add_module(_ddptensor MODULE ${DDPTSrcs} ${Hpps})
+add_library(_ddpt_rt SHARED ${RTSrcs} ${Hpps})
 add_library(idtr SHARED ${IDTRSrcs} ${Hpps})
-set(AllTargets _ddptensor idtr)
+set(AllTargets _ddptensor _ddpt_rt idtr)
 
 add_compile_definitions(USE_MKL=1)
 add_compile_options("-ftemplate-backtrace-limit=0")
@@ -144,17 +147,24 @@ get_property(imex_all_libs GLOBAL PROPERTY IMEX_ALL_LIBS)
 
 #llvm_update_compile_flags(_ddpttensor)
 target_link_directories(_ddptensor PRIVATE ${CONDA_PREFIX}/lib)
-target_link_directories(idtr PRIVATE ${CONDA_PREFIX}/lib ${IMEX_INSTALL_PREFIX}/lib)
+target_link_directories(_ddpt_rt PRIVATE ${CONDA_PREFIX}/lib) # ${IMEX_INSTALL_PREFIX}/lib)
+target_link_directories(idtr PRIVATE ${CONDA_PREFIX}/lib)
 
 target_link_libraries(_ddptensor PRIVATE
     # ${MKL_LIBRARIES}
     # tbb
+    _ddpt_rt
     idtr
 )
 target_link_libraries(idtr PRIVATE
     ${MPI_C_LIBRARIES}
     # ${MKL_LIBRARIES}
     tbb
+)
+target_link_libraries(_ddpt_rt PRIVATE
+    ${MPI_C_LIBRARIES}
+    # ${MKL_LIBRARIES}
+    tbb
     IMEXPTensorDialect
     IMEXPTensorTransforms
     IMEXPTensorToLinalg
diff --git a/src/Creator.cpp b/src/Creator.cpp
@@ -3,6 +3,7 @@
 #include "ddptensor/Deferred.hpp"
 #include "ddptensor/Factory.hpp"
 #include "ddptensor/DDPTensorImpl.hpp"
+#include "ddptensor/Transceiver.hpp"
 
 #include <imex/Dialect/PTensor/IR/PTensorOps.h>
 #include <imex/Utils/PassUtils.h>
@@ -153,11 +154,10 @@ struct DeferredFull : public Deferred
         ::imex::ptensor::DType dtyp;
         ::mlir::Value val = dispatch<ValAndDType>(_dtype, builder, loc, _val, dtyp);
 
-        auto dmy = ::imex::createInt<1>(loc, builder, 0);
         auto team = ::imex::createIndex(loc, builder, reinterpret_cast<uint64_t>(getTransceiver()));
 
         dm.addVal(this->guid(),
-                  builder.create<::imex::ptensor::CreateOp>(loc, shp, dtyp, val, dmy, team),
+                  builder.create<::imex::ptensor::CreateOp>(loc, shp, dtyp, val, nullptr, team),
                   [this](uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides,
                          uint64_t * gs_allocated, uint64_t * gs_aligned, uint64_t * lo_allocated, uint64_t * lo_aligned) {
             assert(rank == this->_shape.size());
@@ -207,13 +207,10 @@ struct DeferredArange : public Deferred
         auto start = ::imex::createInt(loc, builder, _start);
         auto stop = ::imex::createInt(loc, builder, _end);
         auto step = ::imex::createInt(loc, builder, _step);
-        auto dtype = builder.getI64Type(); // FIXME
-        auto artype = ::imex::ptensor::PTensorType::get(builder.getContext(), 1, dtype, false);
-        auto dmy = ::imex::createInt<1>(loc, builder, 0);
         // ::mlir::Value 
         auto team = ::imex::createIndex(loc, builder, reinterpret_cast<uint64_t>(getTransceiver()));
         dm.addVal(this->guid(),
-                  builder.create<::imex::ptensor::ARangeOp>(loc, artype, start, stop, step, dmy, team),
+                  builder.create<::imex::ptensor::ARangeOp>(loc, start, stop, step, nullptr, team),
                   [this](uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides,
                          uint64_t * gs_allocated, uint64_t * gs_aligned, uint64_t * lo_allocated, uint64_t * lo_aligned) {
             assert(rank == 1);
diff --git a/src/DDPTensorImpl.cpp b/src/DDPTensorImpl.cpp
@@ -5,6 +5,7 @@
 
 #include <ddptensor/DDPTensorImpl.hpp>
 #include <ddptensor/CppTypes.hpp>
+#include <ddptensor/Transceiver.hpp>
 
 #include <algorithm>
 
@@ -152,21 +153,6 @@ int64_t DDPTensorImpl::__int__() const
     return res;
 }
 
-void DDPTensorImpl::bufferize(const NDSlice & slc, Buffer & buff) const
-{
-    // FIXME slices/strides
-#if 0
-    if(slc.size() <= 0) return;
-    NDSlice lslice = NDSlice(slice().tile_shape()).slice(slc);
-#endif
-    assert(_strides[0] == 1);
-    auto pos = buff.size();
-    auto sz = size()*item_size();
-    buff.resize(pos + sz);
-    void * out = buff.data() + pos;
-    dispatch(_dtype, _aligned, [this, sz, out](auto * ptr) { memcpy(out, ptr + this->_offset, sz); });
-}
-
 void DDPTensorImpl::add_to_args(std::vector<void*> & args, int ndims)
 {
     assert(ndims == this->ndims());
@@ -180,24 +166,25 @@ void DDPTensorImpl::add_to_args(std::vector<void*> & args, int ndims)
     args.push_back(buff);
     // second the team
     args.push_back(reinterpret_cast<void*>(1));
-    if(ndims > 0)
-    // global shape third
-    buff = new intptr_t[dtensor_sz(1)];
-    buff[0] = reinterpret_cast<intptr_t>(_gs_allocated);
-    buff[1] = reinterpret_cast<intptr_t>(_gs_aligned);
-    buff[2] = 0;
-    buff[3] = ndims;
-    buff[4] = 1;
-    args.push_back(buff);
-    assert(5 == memref_sz(1));
-    // local offsets last
-    buff = new intptr_t[dtensor_sz(1)];
-    buff[0] = reinterpret_cast<intptr_t>(_lo_allocated);
-    buff[1] = reinterpret_cast<intptr_t>(_lo_aligned);
-    buff[2] = 0;
-    buff[3] = ndims;
-    buff[4] = 1;
-    args.push_back(buff);
+    if(ndims > 0) {
+        // global shape third
+        buff = new intptr_t[dtensor_sz(1)];
+        buff[0] = reinterpret_cast<intptr_t>(_gs_allocated);
+        buff[1] = reinterpret_cast<intptr_t>(_gs_aligned);
+        buff[2] = 0;
+        buff[3] = ndims;
+        buff[4] = 1;
+        args.push_back(buff);
+        assert(5 == memref_sz(1));
+        // local offsets last
+        buff = new intptr_t[dtensor_sz(1)];
+        buff[0] = reinterpret_cast<intptr_t>(_lo_allocated);
+        buff[1] = reinterpret_cast<intptr_t>(_lo_aligned);
+        buff[2] = 0;
+        buff[3] = ndims;
+        buff[4] = 1;
+        args.push_back(buff);
+    }
 }
 
 void DDPTensorImpl::replicate()
diff --git a/src/MPITransceiver.cpp b/src/MPITransceiver.cpp
@@ -168,23 +168,36 @@ void MPITransceiver::reduce_all(void * inout, DTypeId T, size_t N, RedOpType op)
 void MPITransceiver::alltoall(const void* buffer_send,
                               const int* counts_send,
                               const int* displacements_send,
-                              DTypeId datatype_send,
+                              DTypeId datatype,
                               void* buffer_recv,
                               const int* counts_recv,
-                              const int* displacements_recv,
-                              DTypeId datatype_recv)
+                              const int* displacements_recv)
 {
     MPI_Alltoallv(buffer_send,
                   counts_send,
                   displacements_send,
-                  to_mpi(datatype_send),
+                  to_mpi(datatype),
                   buffer_recv,
                   counts_recv,
                   displacements_recv,
-                  to_mpi(datatype_recv),
+                  to_mpi(datatype),
                   _comm);
 }
 
+void MPITransceiver::alltoall(const void* buffer_send,
+                              const int counts,
+                              DTypeId datatype,
+                              void* buffer_recv)
+{   // fixed-count all-to-all: the same count and datatype are used for the send and the receive side
+    MPI_Alltoall(buffer_send,
+                 counts,
+                 to_mpi(datatype),
+                 buffer_recv,
+                 counts,
+                 to_mpi(datatype),
+                 _comm);
+}
+
 void MPITransceiver::gather(void* buffer,
                             const int* counts,
                             const int* displacements,
diff --git a/src/SetGetItem.cpp b/src/SetGetItem.cpp
@@ -4,6 +4,7 @@
 #include "ddptensor/DDPTensorImpl.hpp"
 #include "ddptensor/Mediator.hpp"
 #include "ddptensor/Factory.hpp"
+#include "ddptensor/NDSlice.hpp"
 
 #include <imex/Dialect/PTensor/IR/PTensorOps.h>
 #include <imex/Dialect/Dist/IR/DistOps.h>
diff --git a/src/idtr.cpp b/src/idtr.cpp
@@ -1,14 +1,15 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include <ddptensor/idtr.hpp>
-#include <ddptensor/jit/mlir.hpp>
+// #include <ddptensor/jit/mlir.hpp>
 #include <ddptensor/DDPTensorImpl.hpp>
 #include <ddptensor/MPITransceiver.hpp>
 
-#include <imex/Dialect/PTensor/IR/PTensorOps.h>
+#include <imex/Dialect/PTensor/IR/PTensorDefs.h>
 
 #include <cassert>
 #include <memory>
+#include <iostream>
 
 using container_type = std::unordered_map<id_type, std::unique_ptr<DDPTensorImpl>>;
 
@@ -160,6 +161,51 @@ static DTypeId mlir2ddpt(const ::imex::ptensor::DType dt)
     };
 }
 
+
+template<typename T, typename OP>
+void forall(uint64_t d, const T * cptr, const int64_t * sizes, const int64_t * strides, uint64_t nd, OP op)
+{   // Visits all elements of a strided nd-array (dims d..nd-1, last dim innermost) and calls op(const T*) on each
+    auto stride = strides[d];
+    auto sz = sizes[d];
+    if(d==nd-1) {
+        for(int64_t i=0; i<sz; ++i) { // 64-bit index: sizes/strides are int64_t
+            op(&cptr[i*stride]);
+        }
+    } else {
+        for(int64_t i=0; i<sz; ++i) {
+            forall(d+1, &cptr[i*stride], sizes, strides, nd, op); // step into the i-th slice of dim d (was: always slice 0)
+        }
+    }
+}
+
+bool is_contiguous(const int64_t * sizes, const int64_t * strides, uint64_t nd)
+{   // true iff the innermost stride is 1 and every outer stride equals the product of all inner sizes
+    if(nd == 0) return true;
+    if(strides[nd-1] != 1) return false;
+    int64_t sz = 1; // 64-bit accumulator: the product of extents can exceed the range of int
+    for(auto i=nd-1; i>0; --i) {
+        sz *= sizes[i];
+        if(strides[i-1] != sz) return false;
+    }
+    return true;
+}
+
+void * bufferize(void * cptr, DTypeId dtype, const int64_t * sizes, const int64_t * strides, uint64_t nd, void * out)
+{   // Returns cptr unchanged if the layout is contiguous; otherwise copies every element into out and returns out
+    if(is_contiguous(sizes, strides, nd)) {
+        return cptr;
+    } else {
+        dispatch(dtype, cptr, [sizes, strides, nd, out](auto * ptr) {
+            auto buff = static_cast<decltype(ptr)>(out); // write cursor into out, typed like the input elements
+            forall(0, ptr, sizes, strides, nd, [&buff](const auto * in) {
+                *buff = *in;
+                ++buff;
+            });
+        });
+        return out;
+    }
+}
+
 extern "C" {
 // Elementwise inplace allreduce
 void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, ReduceOpId op)
@@ -168,12 +214,59 @@ void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, ReduceOpId op)
 }
 
 // FIXME hard-coded for contiguous layout
-void _idtr_reduce_all(uint64_t rank, void * data, int64_t * sizes, int64_t * strides, int dtype, int op)
+void _idtr_reduce_all(uint64_t rank, void * data, const int64_t * sizes, const int64_t * strides, int dtype, int op)
 {
     assert(rank == 0 || strides[rank-1] == 1);
     idtr_reduce_all(data,
                     mlir2ddpt(static_cast<::imex::ptensor::DType>(dtype)),
                     rank ? rank : 1,
                     mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
 }
+
+void _idtr_rebalance(uint64_t rank, const int64_t * gShape, const int64_t * lOffs,
+                     void * data, const int64_t * sizes, const int64_t * strides, int dtype,
+                     uint64_t outRank, void * out, const int64_t * outSizes, const int64_t * outStrides)
+{   // Redistributes along dim 0: target i gets rows [i*sz, min(gShape[0], (i+1)*sz)), sz = ceil(gShape[0]/nranks)
+    assert(rank);
+    assert(is_contiguous(outSizes, outStrides, outRank)); // elements are received straight into out
+    auto N = (int64_t)getTransceiver()->nranks();
+    auto myOff = lOffs[0];
+    auto mySz = sizes[0];
+    auto myEnd = myOff + mySz;
+    auto tSz = gShape[0];
+    auto sz = (tSz + N - 1) / N;
+    auto ddpttype = mlir2ddpt(static_cast<::imex::ptensor::DType>(dtype));
+    auto nSz = std::accumulate(&sizes[1], &sizes[rank], int64_t{1}, std::multiplies<int64_t>()); // int64_t init: accumulate in 64 bit
+    std::vector<int> soffs(N);
+    std::vector<int> sszs(N, 0);
+    for(auto i=0; i<N; ++i) {
+        auto tOff = i * sz;
+        auto tEnd = std::min(tSz, tOff + sz);
+        if(tEnd > myOff && tOff < myEnd) {
+            // We have a target partition which overlaps my local data
+            // we now compute what data goes to this target partition
+            auto start = std::max(myOff, tOff);
+            auto end = std::min(myEnd, tEnd);
+            soffs[i] = (int)(start - myOff) * nSz;
+            sszs[i] = (int)(end - start) * nSz;
+        } else {
+            soffs[i] = i ? soffs[i-1] + sszs[i-1] : 0; // nothing to send: keep displacements monotonic
+        }
+    }
+    // we now send our send sizes to others and receive theirs
+    std::vector<int> rszs(N);
+    getTransceiver()->alltoall(sszs.data(), 1, INT32, rszs.data());
+    // For the actual alltoall we need the receive-displacements
+    std::vector<int> roffs(N);
+    roffs[0] = 0;
+    for(auto i=1; i<N; ++i) {
+        // compute for all i > 0
+        roffs[i] = roffs[i-1] + rszs[i-1];
+    }
+    // create send buffer (might be strided!)
+    Buffer buff(nSz * mySz * sizeof_dtype(ddpttype));
+    auto ptr = bufferize(data, ddpttype, sizes, strides, rank, buff.data());
+    // Finally communicate elements
+    getTransceiver()->alltoall(ptr, sszs.data(), soffs.data(), ddpttype, out, rszs.data(), roffs.data());
+}
 } // extern "C"
diff --git a/src/include/ddptensor/CollComm.hpp b/src/include/ddptensor/CollComm.hpp
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "CppTypes.hpp"
+#include "PVSlice.hpp"
 #include "DDPTensorImpl.hpp"
 
 struct CollComm
diff --git a/src/include/ddptensor/DDPTensorImpl.hpp b/src/include/ddptensor/DDPTensorImpl.hpp
@@ -5,7 +5,6 @@
 
 #pragma once
 
-#include "PVSlice.hpp"
 #include "p2c_ids.hpp"
 #include "tensor_i.hpp"
 #include "TypeDispatch.hpp"
@@ -146,8 +145,6 @@ class DDPTensorImpl : public tensor_i
         return sizeof_dtype(_dtype);
     }
 
-    virtual void bufferize(const NDSlice & slc, Buffer & buff) const;
-
     virtual void add_to_args(std::vector<void*> & args, int ndims);
 
     template<typename T>
diff --git a/src/include/ddptensor/MPITransceiver.hpp b/src/include/ddptensor/MPITransceiver.hpp
@@ -46,8 +46,11 @@ class MPITransceiver : public Transceiver
                           DTypeId datatype_send,
                           void* buffer_recv,
                           const int* counts_recv,
-                          const int* displacements_recv,
-                          DTypeId datatype_recv);
+                          const int* displacements_recv);
+    virtual void alltoall(const void* buffer_send,
+                          const int counts,
+                          DTypeId datatype,
+                          void* buffer_recv);
     virtual void gather(void* buffer,
                         const int* counts,
                         const int* displacements,
diff --git a/src/include/ddptensor/Transceiver.hpp b/src/include/ddptensor/Transceiver.hpp
diff --git a/src/include/ddptensor/TypeDispatch.hpp b/src/include/ddptensor/TypeDispatch.hpp
diff --git a/src/include/ddptensor/tensor_i.hpp b/src/include/ddptensor/tensor_i.hpp
diff --git a/test/test_ewb.py b/test/test_ewb.py