Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit ee0bb9e

Browse files
committed
async execution in separate thread
1 parent c8e106b commit ee0bb9e

File tree

15 files changed

+59
-44
lines changed

15 files changed

+59
-44
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ find_package(pybind11 CONFIG)
2020
find_package(MPI REQUIRED)
2121
#find_package(OpenMP)
2222

23-
set(MKL_LIBRARIES -L$ENV{MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lrt -ldl -lm)
23+
set(MKL_LIBRARIES -L$ENV{MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -ltbb -lpthread -lrt -ldl -lm)
2424
#set(CMAKE_INSTALL_RPATH $ENV{MKLROOT}/lib)
2525
# Use -fPIC even if statically compiled
2626
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

ddptensor/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
UINT32 as uint32,
2626
UINT16 as uint16,
2727
UINT8 as uint8,
28-
fini
28+
fini,
29+
sync
2930
)
3031
from .ddptensor import dtensor
3132
from os import getenv

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def build_cmake(self, ext):
2929
extdir.parent.mkdir(parents=True, exist_ok=True)
3030

3131
# example of cmake args
32-
config = 'Debug'# if self.debug else 'Release'
32+
config = 'Debug' if self.debug else 'Release' #'RelWithDebInfo'
3333
cmake_args = [
3434
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + str(extdir.parent.absolute()),
3535
'-DCMAKE_BUILD_TYPE=' + config

src/Creator.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ namespace x {
4343

4444
static ptr_type op(uint64_t start, uint64_t end, uint64_t step)
4545
{
46-
PVSlice pvslice({Slice(start, end, step).size()});
46+
PVSlice pvslice({static_cast<uint64_t>(Slice(start, end, step).size())});
4747
auto lslc = pvslice.slice_of_rank();
4848
const auto & l1dslc = lslc.dim(0);
4949
auto a = xt::arange<T>(start + l1dslc._start*step, start + l1dslc._end * step, l1dslc._step);

src/Deferred.cpp

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,40 @@
11
#include "include/ddptensor/Deferred.hpp"
2-
#include <queue>
2+
#include <oneapi/tbb/concurrent_queue.h>
33

4-
static std::queue<Deferred::ptr_type> _deferred;
4+
static tbb::concurrent_bounded_queue<Deferred::ptr_type> _deferred;
55

66
Deferred::future_type Deferred::defer(Deferred::ptr_type && d)
77
{
8-
//auto f = d->get_future();
8+
auto f = d ? d->get_future() : Deferred::future_type();
99
_deferred.push(std::move(d));
10-
// return f;
11-
auto aa = Deferred::undefer_next();
10+
return f;
11+
/* auto aa = Deferred::undefer_next();
1212
aa->run();
13-
return aa->get_future();
13+
return aa->get_future(); */
1414
}
1515

1616
Deferred::ptr_type Deferred::undefer_next()
1717
{
18-
auto r = std::move(_deferred.front());
19-
_deferred.pop();
18+
Deferred::ptr_type r;
19+
_deferred.pop(r);
2020
return r;
2121
}
22+
23+
void process_promises()
24+
{
25+
while(true) {
26+
Deferred::ptr_type d;
27+
_deferred.pop(d);
28+
// auto d = std::move(Deferred::undefer_next());
29+
if(d) d->run();
30+
else break;
31+
d.reset();
32+
}
33+
}
34+
35+
void sync()
36+
{
37+
while(!_deferred.empty()) {
38+
std::this_thread::sleep_for(std::chrono::milliseconds(1));
39+
}
40+
}

src/MPIMediator.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ uint64_t MPIMediator::register_array(tensor_i::ptr_type ary)
5353
return s_last_id;
5454
}
5555

56-
uint64_t MPIMediator::unregister_array(uint64_t id)
56+
void MPIMediator::unregister_array(uint64_t id)
5757
{
58+
locker _l(ak_mutex);
5859
s_ak.erase(id);
5960
}
6061

@@ -72,7 +73,6 @@ void MPIMediator::pull(rank_type from, const tensor_i & ary, const NDSlice & sli
7273
ser.adapter().flush();
7374

7475
auto sz = slice.size() * ary.item_size();
75-
std::cerr << "alsdkjf " << sz << " " << buff.size() << " " << rbuff << std::endl;
7676
MPI_Irecv(rbuff, sz, MPI_CHAR, from, PUSH_TAG, comm, &request[1]);
7777
MPI_Isend(buff.data(), buff.size(), MPI_CHAR, from, PULL_TAG, comm, &request[0]);
7878
auto error_code = MPI_Waitall(2, &request[0], &status[0]);
@@ -123,12 +123,16 @@ void MPIMediator::listen()
123123
MPI_Irecv(buff.data(), buff.size(), MPI_CHAR, MPI_ANY_SOURCE, PULL_TAG, comm, &request_in);
124124

125125
// Now find the array in question and send back its bufferized slice
126-
locker _l(ak_mutex);
127-
auto x = s_ak.find(id);
128-
if(x == s_ak.end()) throw(std::runtime_error("Encountered pull request for unknown tensor."));
126+
array_keeper_type::iterator x;
127+
tensor_i::ptr_type ptr;
128+
{
129+
locker _l(ak_mutex);
130+
x = s_ak.find(id);
131+
if(x == s_ak.end()) throw(std::runtime_error("Encountered pull request for unknown tensor."));
132+
ptr = x->second.lock();
133+
}
129134
// Wait for previous answer to complete so that we can re-use the buffer
130135
MPI_Wait(&request_out, MPI_STATUS_IGNORE);
131-
auto ptr = x->second.lock();
132136
ptr->bufferize(slice, rbuff);
133137
if(slice.size() * ptr->item_size() != rbuff.size()) throw(std::runtime_error("Got unexpected buffer size."));
134138
MPI_Isend(rbuff.data(), rbuff.size(), MPI_CHAR, requester, PUSH_TAG, comm, &request_out);

src/Random.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ struct DeferredRandomOp : public Deferred
3636
switch(_dtype) {
3737
case FLOAT64:
3838
set_value(x::Rand<double>::op(_shape, _lower, _upper));
39+
return;
3940
case FLOAT32:
4041
set_value(x::Rand<float>::op(_shape, _lower, _upper));
42+
return;
4143
}
4244
throw std::runtime_error("rand: dtype must be a floating point type");
4345
}

src/SetGetItem.cpp

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,45 +32,33 @@ namespace x {
3232
// (DPTensorX<T> & dest, const NDSlice & dest_slice, const DPTensorX<U> & val, const NDSlice & val_slice)
3333
{
3434
// const PVSlice & org_slice = dest.slice();
35-
std::cerr << "_set_slice " << org_slice << " " << dest_slice << " " << val->slice() << " " << val_slice << std::endl;
3635
auto nd = org_slice.ndims();
37-
// if(dest.owner() == REPLICATED && nd > 0)
38-
// std::cerr << "Warning: __setitem__ on replicated data updates local tile only" << std::endl;
3936
if(nd != dest_slice.ndims())
4037
throw std::runtime_error("Index dimensionality must match array dimensionality");
4138
if(val_slice.size() != dest_slice.size())
4239
throw std::runtime_error("Input and output slices must be of same size");
4340

4441
// Use given slice to create a global view into orig array
4542
PVSlice g_slc_view(org_slice, dest_slice);
46-
std::cerr << "g_slice: " << g_slc_view.slice() << std::endl;
4743
// Create a view into val
4844
PVSlice needed_val_view(val->slice(), val_slice);
49-
std::cerr << "needed_val_view: " << needed_val_view.slice() << " (was " << val->slice().slice() << ")" << std::endl;
5045

5146
// we can now compute which ranks actually hold which piece of the data from val that we need locally
5247
for(rank_type i=0; i<theTransceiver->nranks(); ++i ) {
5348
// get local view into val
5449
PVSlice val_local_view(val->slice(), i);
55-
std::cerr << i << " val_local_view: " << val_local_view.slice() << std::endl;
5650
NDSlice curr_needed_val_slice = needed_val_view.slice_of_rank(i);
57-
std::cerr << i << " curr_needed_val_slice: " << curr_needed_val_slice << std::endl;
5851
NDSlice curr_local_val_slice = val_local_view.map_slice(curr_needed_val_slice);
59-
std::cerr << i << " curr_local_val_slice: " << curr_local_val_slice << std::endl;
6052
NDSlice curr_needed_norm_slice = needed_val_view.map_slice(curr_needed_val_slice);
61-
std::cerr << i << " curr_needed_norm_slice: " << curr_needed_norm_slice << std::endl;
6253
PVSlice my_curr_needed_view = PVSlice(g_slc_view, curr_needed_norm_slice);
63-
std::cerr << i << " my_curr_needed_slice: " << my_curr_needed_view.slice() << std::endl;
6454
NDSlice my_curr_local_slice = my_curr_needed_view.local_slice_of_rank(theTransceiver->rank());
65-
std::cerr << i << " my_curr_local_slice: " << my_curr_local_slice << std::endl;
55+
6656
if(curr_needed_norm_slice.size()) {
6757
py::tuple tpl = _make_tuple(my_curr_local_slice); //my_curr_view.slice());
6858
if(i == theTransceiver->rank()) {
6959
// copy locally
70-
std::cerr << "local copy\n";
7160
auto to_v = xt::strided_view(dest/*.xarray()*/, to_xt(my_curr_local_slice));
7261
auto from_v = xt::strided_view(val->xarray(), to_xt(curr_local_val_slice));
73-
std::cerr << "to: " << to_v << std::endl << "from: " << from_v << std::endl;
7462
to_v = from_v;
7563
} else {
7664
// pull slice directly into new array
@@ -92,13 +80,9 @@ namespace x {
9280
{
9381
// Use given slice to create a global view into orig array
9482
PVSlice g_slc_view(a_ptr->slice(), slice);
95-
std::cerr << "g_slice: " << g_slc_view.slice() << std::endl;
9683
NDSlice my_slice = g_slc_view.slice_of_rank();
97-
std::cerr << "my_slice: " << my_slice << std::endl;
9884
NDSlice my_norm_slice = g_slc_view.map_slice(my_slice);
99-
std::cerr << "my_norm_slice: " << my_norm_slice << std::endl;
10085
NDSlice my_rel_slice = a_ptr->slice().map_slice(my_slice);
101-
std::cerr << "my_rel_slice: " << my_rel_slice << std::endl;
10286

10387
theTransceiver->barrier();
10488
_set_slice<A>(a_ptr->xarray(), a_ptr->slice(),

src/ddptensor.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ using namespace pybind11::literals; // to bring _a
2121

2222
#include "ddptensor/MPITransceiver.hpp"
2323
#include "ddptensor/MPIMediator.hpp"
24+
#include "ddptensor/Deferred.hpp"
2425
#include "ddptensor/Creator.hpp"
2526
#include "ddptensor/IEWBinOp.hpp"
2627
#include "ddptensor/EWBinOp.hpp"
@@ -41,10 +42,14 @@ rank_type myrank()
4142

4243
Transceiver * theTransceiver = nullptr;
4344
Mediator * theMediator = nullptr;
45+
std::thread * pprocessor;
4446

4547
// users currently need to call fini to make MPI terminate gracefully
4648
void fini()
4749
{
50+
Deferred::defer(nullptr);
51+
pprocessor->join();
52+
delete pprocessor;
4853
delete theMediator;
4954
theMediator = nullptr;
5055
delete theTransceiver;
@@ -56,12 +61,14 @@ void fini()
5661
PYBIND11_MODULE(_ddptensor, m) {
5762
theTransceiver = new MPITransceiver();
5863
theMediator = new MPIMediator();
64+
pprocessor = new std::thread(process_promises);
5965

6066
m.doc() = "A partitioned and distributed tensor";
6167

6268
def_enums(m);
6369

6470
m.def("fini", &fini)
71+
.def("sync", &sync)
6572
.def("myrank", &myrank)
6673
.def("_get_slice", &GetItem::get_slice)
6774
.def("_get_local", &GetItem::get_local);

src/include/ddptensor/Deferred.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,6 @@ struct UnDeferred : public Deferred
3737
{
3838
}
3939
};
40+
41+
extern void process_promises();
42+
extern void sync();

0 commit comments

Comments (0)