Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit 172c55d

Browse files
committed
adding numpy.fromfunction; acquire GIL for immediate execution/run, release when waiting
1 parent f97f1e8 commit 172c55d

File tree

11 files changed

+154
-40
lines changed

11 files changed

+154
-40
lines changed

ddptensor/numpy/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
1-
import dtensor
1+
from .. import empty, float32
2+
3+
4+
def fromfunction(function, shape, *, dtype=float32):
    """numpy.fromfunction-style constructor.

    Creates an empty tensor of the given ``shape``/``dtype`` and fills it
    by mapping ``function`` over the tensor: each element is set to the
    value returned by ``function`` called with that element's global index.
    NOTE(review): unlike numpy.fromfunction, ``function`` is called once per
    element with scalar index arguments rather than with index arrays —
    confirm this is the intended contract.
    """
    t = empty(shape, dtype)
    # delegate to the C++ map() which applies `function` elementwise
    t._t.map(function)
    return t

src/Creator.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,11 @@ struct DeferredFull : public Deferred {
151151
::imex::ptensor::DType dtyp;
152152
::mlir::Value val = dispatch<ValAndDType>(_dtype, builder, loc, _val, dtyp);
153153

154-
auto team = ::imex::createIndex(
155-
loc, builder, reinterpret_cast<uint64_t>(getTransceiver()));
154+
auto team = /*getTransceiver()->nranks() <= 1
155+
? ::mlir::Value()
156+
:*/
157+
::imex::createIndex(loc, builder,
158+
reinterpret_cast<uint64_t>(getTransceiver()));
156159

157160
dm.addVal(this->guid(),
158161
builder.create<::imex::ptensor::CreateOp>(loc, shp, dtyp, val,
@@ -206,8 +209,12 @@ struct DeferredArange : public Deferred {
206209
auto stop = ::imex::createInt(loc, builder, _end);
207210
auto step = ::imex::createInt(loc, builder, _step);
208211
// ::mlir::Value
209-
auto team = ::imex::createIndex(
210-
loc, builder, reinterpret_cast<uint64_t>(getTransceiver()));
212+
auto team = /*getTransceiver()->nranks() <= 1
213+
? ::mlir::Value()
214+
:*/
215+
::imex::createIndex(loc, builder,
216+
reinterpret_cast<uint64_t>(getTransceiver()));
217+
211218
dm.addVal(this->guid(),
212219
builder.create<::imex::ptensor::ARangeOp>(loc, start, stop, step,
213220
nullptr, team),

src/Deferred.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
2020
#include <oneapi/tbb/concurrent_queue.h>
2121

22+
#include <pybind11/pybind11.h>
23+
namespace py = pybind11;
24+
2225
#include <iostream>
2326

2427
// thread-safe FIFO queue holding deferred objects
@@ -148,9 +151,10 @@ void process_promises() {
148151
} // no else needed
149152

150153
// now we execute the deferred action which could not be compiled
151-
if (d)
154+
if (d) {
155+
py::gil_scoped_acquire acquire;
152156
d->run();
157+
d.reset();
158+
}
153159
} while (!done);
154160
}
155-
156-
void sync_promises() { (void)Service::run().get(); }

src/SetGetItem.cpp

Lines changed: 75 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,21 @@ template <typename T> struct wrap_array {
167167
}
168168
};
169169

170+
// Wrap the local data of a tensor as a Python array object (zero-copy),
// keeping `handle` alive as the owner of the underlying buffer.
py::object wrap(DDPTensorImpl::ptr_type tnsr, const py::handle &handle) {
  const auto nd = tnsr->ndims();
  const auto lShape = tnsr->local_shape();
  const auto lStrides = tnsr->local_strides();
  const auto itemSize = sizeof_dtype(tnsr->dtype());

  // convert element strides to byte strides as required by the buffer API
  std::vector<ssize_t> byteStrides(nd);
  for (auto i = 0; i < nd; ++i) {
    byteStrides[i] = itemSize * lStrides[i];
  }

  return dispatch<wrap_array>(tnsr->dtype(),
                              std::vector<ssize_t>(lShape, &lShape[nd]),
                              byteStrides, tnsr->data(), handle);
}
184+
170185
// ***************************************************************************
171186

172187
struct DeferredGetLocal
@@ -182,17 +197,7 @@ struct DeferredGetLocal
182197
auto aa = std::move(Registry::get(_a).get());
183198
auto a_ptr = std::dynamic_pointer_cast<DDPTensorImpl>(aa);
184199
assert(a_ptr);
185-
auto tmp_shp = a_ptr->local_shape();
186-
auto tmp_str = a_ptr->local_strides();
187-
auto nd = a_ptr->ndims();
188-
auto eSz = sizeof_dtype(a_ptr->dtype());
189-
std::vector<ssize_t> strides(nd);
190-
for (auto i = 0; i < nd; ++i) {
191-
strides[i] = eSz * tmp_str[i];
192-
}
193-
auto res = dispatch<wrap_array>(a_ptr->dtype(),
194-
std::vector<ssize_t>(tmp_shp, &tmp_shp[nd]),
195-
strides, a_ptr->data(), _handle);
200+
auto res = wrap(a_ptr, _handle);
196201
set_value(res);
197202
}
198203

@@ -317,6 +322,58 @@ struct DeferredSetItem : public Deferred {
317322

318323
// ***************************************************************************
319324

325+
// Deferred operation that applies a Python callable elementwise to a tensor,
// backing numpy-style fromfunction: the callable receives the element's
// *global* index and its return value is stored in place.
// This op executes eagerly in run() (no MLIR is generated); run() calls back
// into Python for every element, so it must execute with the GIL held (the
// worker loop in process_promises acquires it before calling run()).
struct DeferredMap : public Deferred {
  id_type _a;        // guid of the tensor to map over (also the result)
  py::object _func;  // Python callable: (i0, i1, ...) -> element value

  DeferredMap() = default;
  DeferredMap(const tensor_i::future_type &a, py::object &func)
      : Deferred(a.id(), a.dtype(), a.rank(), a.balanced()), _a(a.id()),
        _func(func) {}

  void run() override {
    auto aa = std::move(Registry::get(_a).get());
    auto a_ptr = std::dynamic_pointer_cast<DDPTensorImpl>(aa);
    assert(a_ptr);
    auto nd = a_ptr->ndims();
    // offsets of the local partition within the global tensor
    auto lOffs = a_ptr->local_offsets();
    std::vector<int64_t> lIdx(nd); // local index, maintained by forall
    std::vector<int64_t> gIdx(nd); // global index, passed to _func

    dispatch(a_ptr->dtype(), a_ptr->data(), [&](auto *ptr) {
      forall(
          0, ptr, a_ptr->local_shape(), a_ptr->local_strides(), nd, lIdx,
          [&](const std::vector<int64_t> &idx, auto *elPtr) {
            // translate local index -> global index
            for (auto i = 0; i < nd; ++i) {
              gIdx[i] = idx[i] + lOffs[i];
            }
            auto pyIdx = _make_tuple(gIdx);
            // call _func with the index tuple unpacked as positional args
            // and cast the result to the tensor's element type
            *elPtr =
                _func(*pyIdx)
                    .cast<
                        typename std::remove_pointer<decltype(elPtr)>::type>();
          });
    });

    this->set_value(aa);
  };

  // Nothing to generate: this op runs eagerly in run().
  bool generate_mlir(::mlir::OpBuilder &builder, ::mlir::Location loc,
                     jit::DepManager &dm) override {
    return true;
  }

  FactoryId factory() const { return F_MAP; }

  // Not serializable: _func is a live Python object and cannot cross
  // process boundaries, hence the hard assert.
  template <typename S> void serialize(S &ser) {
    assert(false);
    ser.template value<sizeof(_a)>(_a);
    // nope ser.template value<sizeof(_func)>(_func);
  }
};
374+
375+
// ***************************************************************************
376+
320377
struct DeferredGetItem : public Deferred {
321378
id_type _a;
322379
NDSlice _slc;
@@ -407,14 +464,18 @@ GetItem::py_future_type GetItem::gather(const ddptensor &a, rank_type root) {
407464

408465
// Assign b (a ddptensor, or a Python object convertible to one) to the
// slice(s) v of tensor a. Returns &a for chaining.
ddptensor *SetItem::__setitem__(ddptensor &a, const std::vector<py::slice> &v,
                                const py::object &b) {
  // mk_future converts b into a tensor future; presumably bb.second flags
  // that a temporary was created which we own — TODO confirm against
  // Creator::mk_future before relying on this ownership convention.
  auto bb = Creator::mk_future(b);
  a.put(defer<DeferredSetItem>(a.get(), bb.first->get(), v));
  if (bb.second)
    delete bb.first;
  return &a;
}
417473

474+
// Apply the Python callable b elementwise to tensor a (in place); b is
// called with each element's global index (see DeferredMap). Returns &a.
// Used to implement numpy-style fromfunction on the Python side.
ddptensor *SetItem::map(ddptensor &a, py::object &b) {
  a.put(defer<DeferredMap>(a.get(), b));
  return &a;
}
478+
418479
py::object GetItem::get_slice(const ddptensor &a,
419480
const std::vector<py::slice> &v) {
420481
const auto aa = std::move(a.get());
@@ -423,5 +484,6 @@ py::object GetItem::get_slice(const ddptensor &a,
423484

424485
FACTORY_INIT(DeferredGetItem, F_GETITEM);
425486
FACTORY_INIT(DeferredSetItem, F_SETITEM);
487+
FACTORY_INIT(DeferredMap, F_MAP);
426488
FACTORY_INIT(DeferredGather, F_GATHER);
427-
FACTORY_INIT(DeferredGather, F_GETLOCAL);
489+
FACTORY_INIT(DeferredGetLocal, F_GETLOCAL);

src/ddptensor.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ extern bool finied;
5555

5656
// users currently need to call fini to make MPI terminate gracefully
5757
void fini() {
58+
py::gil_scoped_release release;
5859
if (finied)
5960
return;
6061
fini_mediator(); // stop task is sent in here
@@ -92,15 +93,22 @@ void init(bool cw) {
9293
finied = false;
9394
}
9495

96+
// Block until all deferred work queued so far has been executed.
// The GIL is released first: the worker loop (process_promises) acquires
// the GIL before running deferreds immediately, so waiting here while
// holding it would deadlock.
void sync_promises() {
  py::gil_scoped_release release;
  (void)Service::run().get();
}
100+
95101
// #########################################################################
96102

97103
/// Trigger compile&run and return future value.
/// Releases the GIL before blocking on .get() so the worker thread can
/// acquire it for immediate execution (see process_promises).
/// NOTE: declares a local `release`, so use at most once per scope; it is
/// deliberately not wrapped in do{}while(0) because it must `return` from
/// the enclosing function.
#define PY_SYNC_RETURN(_f)                                                     \
  py::gil_scoped_release release;                                              \
  Service::run();                                                              \
  return (_f).get()

/// Trigger compile&run and return given attribute _a of the future's value.
/// Same GIL-release caveats as PY_SYNC_RETURN above.
#define SYNC_RETURN(_f, _a)                                                    \
  py::gil_scoped_release release;                                              \
  Service::run();                                                              \
  return (_f).get().get()->_a()
106114

@@ -188,7 +196,8 @@ PYBIND11_MODULE(_ddptensor, m) {
188196
[](const ddptensor &f) { REPL_SYNC_RETURN(f, __int__); })
189197
// attributes returning a new ddptensor
190198
.def("__getitem__", &GetItem::__getitem__)
191-
.def("__setitem__", &SetItem::__setitem__);
199+
.def("__setitem__", &SetItem::__setitem__)
200+
.def("map", &SetItem::map);
192201
#undef REPL_SYNC_RETURN
193202
#undef SYNC_RETURN
194203

src/include/ddptensor/CppTypes.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,19 +198,20 @@ enum FactoryId : int {
198198
F_EWUNYOP,
199199
F_FROMSHAPE,
200200
F_FULL,
201+
F_GATHER,
201202
F_GETITEM,
203+
F_GETLOCAL,
202204
F_IEWBINOP,
203205
F_LINALGOP,
204206
F_MANIPOP,
207+
F_MAP,
205208
F_RANDOM,
206209
F_REDUCEOP,
210+
F_REPLICATE,
207211
F_SERVICE,
208212
F_SETITEM,
209213
F_SORTOP,
210214
F_UNYOP,
211-
F_GATHER,
212-
F_GETLOCAL,
213-
F_REPLICATE,
214215
FACTORY_LAST
215216
};
216217

src/include/ddptensor/DDPTensorImpl.hpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,20 +209,41 @@ template <typename... Ts> static tensor_i::future_type mk_ftx(Ts &&...args) {
209209

210210
// Execute an OP on all elements of a tensor represented by
// dimensionality/ptr/sizes/strides.
//
// forall_ is the recursive worker: it iterates dimension d and recurses
// into d+1 until the innermost dimension, where it invokes op on each
// element. When PASSIDX is true, *idx is kept up to date with the current
// multi-dimensional index and passed to op alongside the element pointer.
template <typename T, typename OP, bool PASSIDX>
void forall_(uint64_t d, T *cptr, const int64_t *sizes, const int64_t *strides,
             uint64_t nd, OP op, std::vector<int64_t> *idx) {
  assert(!PASSIDX || idx);
  auto stride = strides[d];
  auto sz = sizes[d];
  if (d == nd - 1) {
    // innermost dimension: apply op element by element
    // int64_t counter: extents are int64_t, a plain int could truncate
    for (int64_t i = 0; i < sz; ++i) {
      if constexpr (PASSIDX) {
        (*idx)[d] = i;
        op(*idx, &cptr[i * stride]);
      } else {
        op(&cptr[i * stride]);
      }
    }
  } else {
    for (int64_t i = 0; i < sz; ++i) {
      T *tmp = cptr;
      if constexpr (PASSIDX) {
        (*idx)[d] = i;
      }
      forall_<T, OP, PASSIDX>(d + 1, cptr, sizes, strides, nd, op, idx);
      cptr = tmp + strides[d];
    }
  }
}

// Apply op(elementPtr) to every element, starting at dimension d.
template <typename T, typename OP>
void forall(uint64_t d, T *cptr, const int64_t *sizes, const int64_t *strides,
            uint64_t nd, OP op) {
  forall_<T, OP, false>(d, cptr, sizes, strides, nd, op, nullptr);
}

// Apply op(index, elementPtr) to every element; idx is caller-provided
// scratch space of size nd that receives the current multi-dim index.
template <typename T, typename OP>
void forall(uint64_t d, T *cptr, const int64_t *sizes, const int64_t *strides,
            uint64_t nd, std::vector<int64_t> &idx, OP op) {
  forall_<T, OP, true>(d, cptr, sizes, strides, nd, op, &idx);
}

src/include/ddptensor/Deferred.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "tensor_i.hpp"
1717

1818
extern void process_promises();
19-
extern void sync_promises();
2019

2120
// interface for promises/tasks to generate MLIR or execute immediately.
2221
struct Runable {

src/include/ddptensor/PyTypes.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ template <typename T> py::tuple _make_tuple(const std::vector<T> &v) {
115115
[](const V &v, int i) { return v[i]; });
116116
}
117117

118+
// Build a py::tuple of length n from `ptr`, reading elements via ptr[i].
// Delegates to the generic 3-argument _make_tuple(value, size-fn, getter).
// NOTE(review): T is passed by value and is expected to be a pointer or a
// cheap indexable handle supporting operator[] — confirm against callers.
template <typename T> py::tuple _make_tuple(const T ptr, size_t n) {
  return _make_tuple(
      ptr, [n](const T &) { return n; },
      [](const T &v, int i) { return v[i]; });
}
123+
118124
template <typename T> T to_native(const py::object &o) { return o.cast<T>(); }
119125

120126
inline void compute_slice(const py::slice &slc, uint64_t &offset,

src/include/ddptensor/SetGetItem.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,5 @@ struct GetItem {
2626
// Python-facing mutation operations on ddptensor.
struct SetItem {
  // Assign b to the slice(s) v of a; returns &a.
  static ddptensor *__setitem__(ddptensor &a, const std::vector<py::slice> &v,
                                const py::object &b);
  // Apply Python callable b elementwise to a (called with each element's
  // global index, see DeferredMap); returns &a.
  static ddptensor *map(ddptensor &a, py::object &b);
};

0 commit comments

Comments
 (0)