Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit 5f57e98

Browse files
committed
adjust handling of 0d arrays, some GC improvements
1 parent 7babf97 commit 5f57e98

File tree

6 files changed

+67
-57
lines changed

6 files changed

+67
-57
lines changed

src/DDPTensorImpl.cpp

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@ std::string DDPTensorImpl::__repr__() const
106106

107107
dispatch(_dtype, _aligned, [this, nd, &oss](auto * ptr) {
108108
auto cptr = ptr + this->_offset;
109-
printit(oss, 0, cptr);
109+
if(nd>0) printit(oss, 0, cptr);
110+
else oss << *cptr;
110111
});
111112
return oss.str();
112113
}
@@ -158,32 +159,33 @@ void DDPTensorImpl::bufferize(const NDSlice & slc, Buffer & buff) const
158159

159160
void DDPTensorImpl::add_to_args(std::vector<void*> & args, int ndims)
160161
{
161-
assert(ndims == this->ndims() || (ndims == 0 && this->ndims() == 1));
162-
// global shape first
163-
intptr_t * buff = new intptr_t[dtensor_sz(1)];
162+
assert(ndims == this->ndims());
163+
// local tensor first
164+
intptr_t * buff = new intptr_t[dtensor_sz(ndims)];
165+
buff[0] = reinterpret_cast<intptr_t>(_allocated);
166+
buff[1] = reinterpret_cast<intptr_t>(_aligned);
167+
buff[2] = static_cast<intptr_t>(_offset);
168+
memcpy(buff+3, _sizes, ndims*sizeof(intptr_t));
169+
memcpy(buff+3+ndims, _strides, ndims*sizeof(intptr_t));
170+
args.push_back(buff);
171+
// second the team
172+
args.push_back(reinterpret_cast<void*>(1));
173+
if(ndims > 0)
174+
// global shape third
175+
buff = new intptr_t[dtensor_sz(1)];
164176
buff[0] = reinterpret_cast<intptr_t>(_gs_allocated);
165177
buff[1] = reinterpret_cast<intptr_t>(_gs_aligned);
166178
buff[2] = 0;
167179
buff[3] = ndims;
168180
buff[4] = 1;
169181
args.push_back(buff);
170182
assert(5 == memref_sz(1));
171-
// local tensor
172-
buff = new intptr_t[dtensor_sz(ndims)];
173-
buff[0] = reinterpret_cast<intptr_t>(_allocated);
174-
buff[1] = reinterpret_cast<intptr_t>(_aligned);
175-
buff[2] = static_cast<intptr_t>(_offset);
176-
memcpy(buff+3, _sizes, ndims*sizeof(intptr_t));
177-
memcpy(buff+3+ndims, _strides, ndims*sizeof(intptr_t));
178-
args.push_back(buff);
179-
// local offsets
183+
// local offsets last
180184
buff = new intptr_t[dtensor_sz(1)];
181185
buff[0] = reinterpret_cast<intptr_t>(_lo_allocated);
182186
buff[1] = reinterpret_cast<intptr_t>(_lo_aligned);
183187
buff[2] = 0;
184188
buff[3] = ndims;
185189
buff[4] = 1;
186190
args.push_back(buff);
187-
// finally the team
188-
args.push_back(reinterpret_cast<void*>(1));
189191
}

src/Deferred.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ void process_promises()
6565
{
6666
bool done = false;
6767
jit::JIT jit;
68-
6968
do {
7069
::mlir::OpBuilder builder(&jit._context);
7170
auto loc = builder.getUnknownLoc();

src/ddptensor.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ rank_type myrank()
4949
return getTransceiver()->rank();
5050
}
5151

52-
std::thread * pprocessor;
52+
std::thread * pprocessor = nullptr;
5353

5454
extern bool inited;
5555
extern bool finied;
@@ -63,6 +63,7 @@ void fini()
6363
if(getTransceiver()->nranks() == 1) defer(nullptr);
6464
pprocessor->join();
6565
delete pprocessor;
66+
pprocessor = nullptr;
6667
}
6768
fini_transceiver();
6869
Deferred::fini();

src/idtr.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,11 @@ void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, int op)
124124
getTransceiver()->reduce_all(inout, dtype, N, mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
125125
}
126126

127-
// FIXME hard-coded 0d tensor
128-
void _idtr_reduce_all(uint64_t rank, uint64_t * mrd, DTypeId dtype, int op)
127+
// FIXME hard-coded for contiguous layout
128+
void _idtr_reduce_all(uint64_t rank, void * data, int64_t * sizes, int64_t * strides, DTypeId dtype, int op)
129129
{
130-
assert(rank==0);
131-
auto descr = reinterpret_cast<jit::JIT::MemRefDescriptor<uint64_t, 0>*>(mrd);
132-
idtr_reduce_all(descr->aligned + descr->offset, dtype, 1, op);
130+
assert(rank == 0 || strides[rank-1] == 1);
131+
idtr_reduce_all(data, dtype, rank ? rank : 1, op);
133132
}
134133

135134
} // extern "C"

src/include/ddptensor/jit/mlir.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,14 @@ void init();
5050
class DepManager
5151
{
5252
private:
53-
using IdValueMap = std::unordered_map<id_type, std::pair<::mlir::Value, SetResFunc>>;
53+
using IdValueMap = std::unordered_map<id_type, ::mlir::Value>;
54+
using IdCallbackMap = std::unordered_map<id_type, SetResFunc>;
5455
using IdRankMap = std::unordered_map<id_type, int>;
5556
using ArgList = std::vector<std::pair<id_type, int>>;
5657

5758
::mlir::func::FuncOp & _func; // MLIR function to which ops are added
58-
IdValueMap _ivm; // guid -> {mlir::Value, deliver-callback}
59+
IdValueMap _ivm; // guid -> mlir::Value
60+
IdCallbackMap _icm; // guid -> deliver-callback
5961
IdRankMap _irm; // guid -> rank as computed in MLIR
6062
ArgList _args; // input arguments of the generated function
6163

src/jit/mlir.cpp

Lines changed: 40 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,10 @@ ::mlir::Value DepManager::getDependent(::mlir::OpBuilder & builder, id_type guid
128128
_func.insertArgument(idx, typ, {}, loc);
129129
auto val = _func.getArgument(idx);
130130
_args.push_back({guid, fut.rank()});
131-
_ivm[guid] = {val, {}};
131+
_ivm[guid] = val;
132132
return val;
133133
} else {
134-
return d->second.first;
134+
return d->second;
135135
}
136136
}
137137

@@ -151,22 +151,23 @@ std::vector<void*> DepManager::store_inputs()
151151
auto f = Registry::get(a.first);
152152
f.get().get()->add_to_args(res, a.second);
153153
_ivm.erase(a.first); // inputs need no delivery
154+
_icm.erase(a.first);
154155
}
155156
return res;
156157
}
157158

158159
void DepManager::addVal(id_type guid, ::mlir::Value val, SetResFunc cb)
159160
{
160161
assert(_ivm.find(guid) == _ivm.end());
161-
_ivm[guid] = {val, cb};
162+
_ivm[guid] = val;
163+
_icm[guid] = cb;
162164
}
163165

164166
void DepManager::drop(id_type guid)
165167
{
166-
if(auto e = _ivm.find(guid); e != _ivm.end()) {
167-
_ivm.erase(e);
168-
// FIXME create delete op
169-
}
168+
_ivm.erase(guid);
169+
_icm.erase(guid);
170+
// FIXME create delete op
170171
}
171172

172173
// Now we have to define the return type as a ValueRange of all arrays which we have created
@@ -186,7 +187,7 @@ uint64_t DepManager::handleResult(::mlir::OpBuilder & builder)
186187
uint64_t sz = 0;
187188
unsigned idx = 0;
188189
for(auto & v : _ivm) {
189-
::mlir::Value value = v.second.first;
190+
::mlir::Value value = v.second;
190191
// append the type and array/value
191192
auto retDtTyp = value.getType().dyn_cast<::imex::dist::DistTensorType>();
192193
if(!retDtTyp) {
@@ -207,44 +208,49 @@ uint64_t DepManager::handleResult(::mlir::OpBuilder & builder)
207208
// add return statement
208209
auto ret_value = builder.create<::mlir::func::ReturnOp>(builder.getUnknownLoc(), ret_values);
209210

211+
// clear any reference to MLIR values
212+
_ivm.clear();
210213
return sz;
211214
}
212215

213216
void DepManager::deliver(intptr_t * output, uint64_t sz)
214217
{
215218
size_t pos = 0;
216-
for(auto & v : _ivm) {
217-
auto value = v.second.first;
219+
for(auto & v : _icm) {
218220
auto rank = _irm[v.first];
219-
// first extract global shape
220-
uint64_t * gs_allocated = reinterpret_cast<uint64_t*>(output[pos]);
221-
uint64_t * gs_aligned = reinterpret_cast<uint64_t*>(output[pos+1]);
222-
intptr_t gs_offset = output[pos+2];
223-
// no sizes/stride needed
224-
pos += memref_sz(1);
225-
// second extract tensor
221+
// first extract tensor
226222
void * t_allocated = reinterpret_cast<void*>(output[pos]);
227223
void * t_aligned = reinterpret_cast<void*>(output[pos+1]);
228224
intptr_t t_offset = output[pos+2];
229225
intptr_t * t_sizes = output + pos + 3;
230226
intptr_t * t_stride = output + pos + 3 + rank;
231227
pos += memref_sz(rank);
232-
// third extract local offsets
233-
uint64_t * lo_allocated = reinterpret_cast<uint64_t*>(output[pos]);
234-
uint64_t * lo_aligned = reinterpret_cast<uint64_t*>(output[pos+1]);
235-
intptr_t lo_offset = output[pos+2];
236-
// no sizes/stride needed
237-
pos += memref_sz(1);
238-
// last is the team
228+
// second is the team
239229
// auto team = output[pos];
240230
pos += 1;
241-
// call finalization
242-
v.second.second(
243-
rank,
244-
t_allocated, t_aligned, t_offset, t_sizes, t_stride, // tensor
245-
gs_allocated, gs_aligned + gs_offset, // global shape is 1d tensor of uint64_t
246-
lo_allocated, lo_aligned + lo_offset // local offset is 1d tensor of uint64_t
247-
);
231+
if(rank > 0) {
232+
// third extract global shape
233+
uint64_t * gs_allocated = reinterpret_cast<uint64_t*>(output[pos]);
234+
uint64_t * gs_aligned = reinterpret_cast<uint64_t*>(output[pos+1]);
235+
intptr_t gs_offset = output[pos+2];
236+
// no sizes/stride needed
237+
pos += memref_sz(1);
238+
// lastly extract local offsets
239+
uint64_t * lo_allocated = reinterpret_cast<uint64_t*>(output[pos]);
240+
uint64_t * lo_aligned = reinterpret_cast<uint64_t*>(output[pos+1]);
241+
intptr_t lo_offset = output[pos+2];
242+
// no sizes/stride needed
243+
pos += memref_sz(1);
244+
// call finalization
245+
v.second(rank,
246+
t_allocated, t_aligned, t_offset, t_sizes, t_stride, // tensor
247+
gs_allocated, gs_aligned + gs_offset, // global shape is 1d tensor of uint64_t
248+
lo_allocated, lo_aligned + lo_offset // local offset is 1d tensor of uint64_t
249+
);
250+
} else { // 0d tensor
251+
v.second(rank, t_allocated, t_aligned, t_offset, t_sizes, t_stride,
252+
nullptr, nullptr, nullptr, nullptr);
253+
}
248254
}
249255
}
250256

@@ -296,8 +302,9 @@ int JIT::run(::mlir::ModuleOp & module, const std::string & fname, std::vector<v
296302
static const char * pass_pipeline =
297303
getenv("DDPT_PASSES")
298304
? getenv("DDPT_PASSES")
299-
: "func.func(ptensor-dist),convert-dist-to-standard,convert-ptensor-to-linalg,arith-expand,canonicalize,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,scf-bufferize,linalg-bufferize,tensor-bufferize),func-bufferize,canonicalize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops),canonicalize,fold-memref-alias-ops,lower-affine,convert-scf-to-cf,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts";
300-
305+
// : "func.func(ptensor-dist),convert-dist-to-standard,convert-ptensor-to-linalg,arith-expand,canonicalize,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,scf-bufferize,linalg-bufferize,tensor-bufferize),func-bufferize,canonicalize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops),canonicalize,fold-memref-alias-ops,lower-affine,convert-scf-to-cf,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts";
306+
// : "builtin.module(func.func(ptensor-dist),convert-dist-to-standard,convert-ptensor-to-linalg,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,scf-bufferize,linalg-bufferize,tensor-bufferize,bufferization-bufferize),func-bufferize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops),canonicalize,fold-memref-alias-ops,expand-strided-metadata,lower-affine,convert-scf-to-cf,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts)";
307+
: "func.func(ptensor-dist),convert-dist-to-standard,convert-ptensor-to-linalg,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,scf-bufferize,linalg-bufferize,tensor-bufferize,bufferization-bufferize),func-bufferize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops),canonicalize,fold-memref-alias-ops,expand-strided-metadata,lower-affine,convert-scf-to-cf,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts";
301308
JIT::JIT()
302309
: _context(::mlir::MLIRContext::Threading::DISABLED),
303310
_pm(&_context),

0 commit comments

Comments
 (0)