Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit 737de51

Browse files
committed
handling input args to jit-compiled function
1 parent 19b18db commit 737de51

File tree

5 files changed

+110
-22
lines changed

5 files changed

+110
-22
lines changed

src/Deferred.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,20 +106,24 @@ void process_promises()
106106
if(runables.empty()) continue;
107107

108108
// create return statement and adjust function type
109-
uint64_t sz = dm.handleResult(builder);
109+
uint64_t osz = dm.handleResult(builder);
110110
// also request generation of c-wrapper function
111111
function->setAttr(::mlir::LLVM::LLVMDialect::getEmitCWrapperAttrName(), ::mlir::UnitAttr::get(&jit._context));
112112
// add the function to the module
113113
module.push_back(function);
114114
module.dump();
115115

116+
// get input buffers (before results!)
117+
auto input = std::move(dm.store_inputs());
118+
116119
// compile and run the module
117-
assert(sizeof(intptr_t) == sizeof(void*));
118-
intptr_t * output = new intptr_t[sz];
119-
if(jit.run(module, fname, output)) throw std::runtime_error("failed running jit");
120+
intptr_t * output = new intptr_t[osz];
121+
if(jit.run(module, fname, input, output)) throw std::runtime_error("failed running jit");
120122

121123
// push results to deliver promises
122-
dm.deliver(output, sz);
124+
dm.deliver(output, osz);
125+
126+
delete [] output;
123127
} while(!done);
124128
}
125129

src/include/ddptensor/DDPTensorImpl.hpp

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ class DDPTensorImpl : public tensor_i
2929
PVSlice _slice;
3030
void * _allocated;
3131
void * _aligned;
32+
intptr_t * _sizes;
33+
intptr_t * _strides;
3234
uint64_t _offset;
3335
DTypeId _dtype;
3436

@@ -42,13 +44,17 @@ class DDPTensorImpl : public tensor_i
4244
: _owner(owner),
4345
_slice(shape_type(rank ? rank : 1, rank ? sizes[0] : 1), static_cast<int>(owner==REPLICATED ? NOSPLIT : 0)),
4446
_allocated(allocated),
45-
_aligned(nullptr),
47+
_aligned(aligned),
48+
_sizes(new intptr_t[rank]),
49+
_strides(new intptr_t[rank]),
4650
_offset(offset),
4751
_dtype(dtype)
4852
{
4953
assert(rank <= 1);
5054
assert(rank == 0 || strides[0] == 1);
51-
dispatch(_dtype, aligned, [this](auto * ptr) { this->_aligned = ptr + this->_offset; });
55+
56+
memcpy(_sizes, sizes, rank*sizeof(intptr_t));
57+
memcpy(_strides, strides, rank*sizeof(intptr_t));
5258
}
5359

5460
DDPTensorImpl(DTypeId dtype, const shape_type & shp, rank_type owner=NOOWNER)
@@ -60,23 +66,38 @@ class DDPTensorImpl : public tensor_i
6066
_dtype(dtype)
6167
{
6268
alloc();
69+
70+
intptr_t stride = 1;
71+
auto rank = shp.size();
72+
for(auto i=0; i<rank; ++i) {
73+
_sizes[i] = shp[i];
74+
_strides[rank-i-1] = stride;
75+
stride *= shp[i];
76+
}
6377
}
6478

6579
void alloc()
6680
{
6781
auto esz = sizeof_dtype(_dtype);
6882
_allocated = new (std::align_val_t(esz)) char[esz*_slice.size()];
6983
_aligned = _allocated;
84+
auto rank = _slice.ndims();
85+
_sizes = new intptr_t[rank];
86+
_strides = new intptr_t[rank];
7087
_offset = 0;
7188
}
7289

7390
~DDPTensorImpl()
7491
{
92+
delete [] _sizes;
93+
delete [] _strides;
7594
}
7695

7796
void * data()
7897
{
79-
return _aligned;
98+
void * ret;
99+
dispatch(_dtype, _aligned, [this, &ret](auto * ptr) { ret = ptr + this->_offset; });
100+
return ret;
80101
}
81102

82103
bool is_sliced() const
@@ -90,7 +111,8 @@ class DDPTensorImpl : public tensor_i
90111
const auto sz = _slice.size();
91112
std::ostringstream oss;
92113

93-
dispatch(_dtype, _aligned, [sz, &oss](auto * ptr) {
114+
dispatch(_dtype, _aligned, [this, sz, &oss](auto * ptr) {
115+
ptr += this->_offset;
94116
for(auto i=0; i<sz; ++i) {
95117
oss << ptr[i] << " ";
96118
}
@@ -127,7 +149,7 @@ class DDPTensorImpl : public tensor_i
127149
throw(std::runtime_error("Cast to scalar bool: tensor is not replicated"));
128150

129151
bool res;
130-
dispatch(_dtype, _aligned, [&res](auto * ptr) { res = static_cast<bool>(*ptr); });
152+
dispatch(_dtype, _aligned, [this, &res](auto * ptr) { res = static_cast<bool>(ptr[this->_offset]); });
131153
return res;
132154
}
133155

@@ -137,7 +159,7 @@ class DDPTensorImpl : public tensor_i
137159
throw(std::runtime_error("Cast to scalar float: tensor is not replicated"));
138160

139161
double res;
140-
dispatch(_dtype, _aligned, [&res](auto * ptr) { res = static_cast<double>(*ptr); });
162+
dispatch(_dtype, _aligned, [this, &res](auto * ptr) { res = static_cast<double>(ptr[this->_offset]); });
141163
return res;
142164
}
143165

@@ -147,7 +169,7 @@ class DDPTensorImpl : public tensor_i
147169
throw(std::runtime_error("Cast to scalar int: tensor is not replicated"));
148170

149171
float res;
150-
dispatch(_dtype, _aligned, [&res](auto * ptr) { res = static_cast<float>(*ptr); });
172+
dispatch(_dtype, _aligned, [this, &res](auto * ptr) { res = static_cast<float>(ptr[this->_offset]); });
151173
return res;
152174
}
153175

@@ -198,7 +220,18 @@ class DDPTensorImpl : public tensor_i
198220
auto sz = _slice.size()*item_size();
199221
buff.resize(pos + sz);
200222
void * out = buff.data() + pos;
201-
memcpy(out, _aligned, sz);
223+
dispatch(_dtype, _aligned, [this, sz, out](auto * ptr) { memcpy(out, ptr + this->_offset, sz); });
224+
}
225+
226+
virtual uint64_t store_memref(intptr_t * buff, int rank)
227+
{
228+
assert(rank == _slice.ndims() || (_slice.ndims() == 1 && _slice.size() == 1));
229+
buff[0] = reinterpret_cast<intptr_t>(_allocated);
230+
buff[1] = reinterpret_cast<intptr_t>(_aligned);
231+
buff[2] = static_cast<intptr_t>(_offset);
232+
memcpy(buff+3, _sizes, rank*sizeof(intptr_t));
233+
memcpy(buff+3+rank, _strides, rank*sizeof(intptr_t));
234+
return 3 + 2*rank;
202235
}
203236
};
204237

src/include/ddptensor/jit/mlir.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ class DepManager
3333
private:
3434
using IdValueMap = std::unordered_map<id_type, std::pair<::mlir::Value, SetResFunc>>;
3535
using IdRankMap = std::unordered_map<id_type, int>;
36+
using ArgList = std::vector<std::pair<id_type, int>>;
37+
3638
::mlir::func::FuncOp & _func; // MLIR function to which ops are added
3739
IdValueMap _ivm; // guid -> {mlir::Value, deliver-callback}
3840
IdRankMap _irm; // guid -> rank as computed in MLIR
39-
std::vector<id_type> _args; // input args to generated function
41+
ArgList _args; // input arguments of the generated function
4042

4143
public:
4244
DepManager(::mlir::func::FuncOp & f)
@@ -55,11 +57,19 @@ class DepManager
5557
void drop(id_type guid);
5658

5759
/// create return statement and add results to function
60+
/// this must be called after store_inputs
5861
/// @return size of output in number of intptr_t's
5962
uint64_t handleResult(::mlir::OpBuilder & builder);
6063

6164
/// deliver promise after execution
6265
void deliver(intptr_t *, uint64_t);
66+
67+
/// @return total size of all input arguments in number of intptr_t
68+
uint64_t arg_size();
69+
70+
/// store all inputs into given buffer
71+
/// This must be called before handleResult()
72+
std::vector<void*> store_inputs();
6373
};
6474

6575
// A class to manage the MLIR business (compilation and execution).
@@ -77,7 +87,7 @@ class JIT {
7787

7888
JIT();
7989
// run
80-
int run(::mlir::ModuleOp &, const std::string &, void *);
90+
int run(::mlir::ModuleOp &, const std::string &, std::vector<void*> &, intptr_t *);
8191

8292
::mlir::MLIRContext _context;
8393
::mlir::PassManager _pm;

src/include/ddptensor/tensor_i.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ class tensor_i
6969
virtual void bufferize(const NDSlice & slice, Buffer & buff) const = 0;
7070
// size of a single element (in bytes)
7171
virtual int item_size() const = 0;
72+
// store tensor information in form of corresponding jit::JIT::MemRefDescriptor
73+
// @return stored size in number of intptr_t
74+
virtual uint64_t store_memref(intptr_t * buff, int rank) = 0;
7275
};
7376

7477
#if 0

src/jit/mlir.cpp

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,20 +108,48 @@ ::mlir::Value DepManager::getDependent(::mlir::OpBuilder & builder, id_type guid
108108
if(auto d = _ivm.find(guid); d == _ivm.end()) {
109109
// Not found -> this must be an input argument to the jit function
110110
auto idx = _args.size();
111-
auto fut = Registry::get(d->first);
111+
auto fut = Registry::get(guid);
112112
auto typ = getPTType(builder, fut.dtype(), fut.rank());
113113
_func.insertArgument(idx, typ, {}, loc);
114114
auto val = _func.getArgument(idx);
115-
_args.push_back(guid);
115+
_args.push_back({guid, fut.rank()});
116116
_ivm[guid] = {val, {}};
117117
return val;
118118
} else {
119119
return d->second.first;
120120
}
121121
}
122122

123+
// size of memreftype in number of intptr_t's
124+
static inline uint64_t memref_sz(int rank) { return 3 + 2 * rank; }
125+
126+
uint64_t DepManager::arg_size()
127+
{
128+
uint64_t sz = 0;
129+
for(auto a : _args) {
130+
sz += memref_sz(a.second);
131+
}
132+
return sz;
133+
}
134+
135+
std::vector<void*> DepManager::store_inputs()
136+
{
137+
std::vector<void*> res(_args.size());
138+
int i = 0;
139+
for(auto a : _args) {
140+
auto f = Registry::get(a.first);
141+
intptr_t * buff = new intptr_t[memref_sz(a.second)];
142+
auto sz = f.get().get()->store_memref(buff, a.second);
143+
res[i] = buff;
144+
_ivm.erase(a.first); // inputs need no delivery
145+
++i;
146+
}
147+
return res;
148+
}
149+
123150
void DepManager::addVal(id_type guid, ::mlir::Value val, SetResFunc cb)
124151
{
152+
assert(_ivm.find(guid) == _ivm.end());
125153
_ivm[guid] = {val, cb};
126154
}
127155

@@ -158,7 +186,7 @@ uint64_t DepManager::handleResult(::mlir::OpBuilder & builder)
158186
auto rank = ptt.getRtensor().getShape().size();
159187
_irm[v.first] = rank;
160188
// add sizeof(MemRefDescriptor<elementtype, rank>) to sz
161-
sz += 3 + 2 * rank;
189+
sz += memref_sz(rank);
162190
++idx;
163191
}
164192

@@ -179,16 +207,17 @@ void DepManager::deliver(intptr_t * output, uint64_t sz)
179207
intptr_t offset = output[pos+2];
180208
intptr_t * sizes = output + pos + 3;
181209
intptr_t * stride = output + pos + 3 + rank;
182-
pos += 3 + 2 * rank;
210+
pos += memref_sz(rank);
183211
v.second.second(rank, allocated, aligned, offset, sizes, stride);
184212
}
185213
}
186214

187-
int JIT::run(::mlir::ModuleOp & module, const std::string & fname, void * out)
215+
int JIT::run(::mlir::ModuleOp & module, const std::string & fname, std::vector<void*> & inp, intptr_t * out)
188216
{
189217
if (::mlir::failed(_pm.run(module)))
190218
throw std::runtime_error("failed to run pass manager");
191219

220+
module.dump();
192221
// An optimization pipeline to use within the execution engine.
193222
auto optPipeline = ::mlir::makeOptimizingTransformer(/*optLevel=*/0,
194223
/*sizeLevel=*/0,
@@ -202,12 +231,20 @@ int JIT::run(::mlir::ModuleOp & module, const std::string & fname, void * out)
202231
assert(maybeEngine && "failed to construct an execution engine");
203232
auto &engine = maybeEngine.get();
204233

205-
206234
const char * fn = getenv("DDPT_FN");
207235
if(!fn) fn = fname.c_str();
208236

237+
llvm::SmallVector<void *> args;
238+
// first arg must be the result ptr
239+
args.push_back(&out);
240+
// we need a void*& for every input tensor
241+
// we refer directly to the storage in inp
242+
for(auto & arg : inp) {
243+
args.push_back(&arg);
244+
}
245+
209246
// Invoke the JIT-compiled function.
210-
if(engine->invoke(fn, ::mlir::ExecutionEngine::result(out))) {
247+
if(engine->invokePacked(std::string("_mlir_ciface_") + fn, args)) {
211248
::llvm::errs() << "JIT invocation failed\n";
212249
throw std::runtime_error("JIT invocation failed");
213250
}
@@ -243,6 +280,7 @@ JIT::JIT()
243280

244281
void init()
245282
{
283+
assert(sizeof(intptr_t) == sizeof(void*));
246284
::mlir::registerAllPasses();
247285
::imex::registerAllPasses();
248286

0 commit comments

Comments
 (0)