Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit 78ac247

Browse files
committed
using latest imex; connecting idtr to enable proper distributed operation
1 parent eaa83b5 commit 78ac247

File tree

11 files changed

+174
-78
lines changed

11 files changed

+174
-78
lines changed

CMakeLists.txt

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,8 @@ add_custom_command(
8484
# ============
8585

8686
FILE(GLOB Hpps ${PROJECT_SOURCE_DIR}/src/include/ddptensor/*.hpp)
87-
set(Hpps ${Hpps} ${P2C_HPP})
87+
FILE(GLOB JitHpps ${PROJECT_SOURCE_DIR}/src/include/ddptensor/jit/*.hpp)
88+
set(Hpps ${Hpps} ${JitHpps} ${P2C_HPP})
8889

8990
set(DDPTSrcs
9091
${PROJECT_SOURCE_DIR}/src/ddptensor.cpp
@@ -153,7 +154,11 @@ target_link_libraries(idtr PRIVATE
153154
${MPI_C_LIBRARIES}
154155
# ${MKL_LIBRARIES}
155156
tbb
156-
${imex_all_libs}
157+
IMEXPTensorDialect
158+
IMEXPTensorTransforms
159+
IMEXPTensorToLinalg
160+
IMEXDistDialect
161+
IMEXDistToStandard
157162
MLIROptLib
158163
MLIRExecutionEngine
159164
MLIRIR

src/Creator.cpp

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,13 @@
55
#include "ddptensor/DDPTensorImpl.hpp"
66

77
#include <imex/Dialect/PTensor/IR/PTensorOps.h>
8+
#include <imex/internal/PassUtils.h>
9+
810
#include <mlir/IR/Builders.h>
11+
#include <mlir/Dialect/Arithmetic/IR/Arithmetic.h>
12+
#include <mlir/Dialect/Shape/IR/Shape.h>
13+
#include <mlir/Dialect/Tensor/IR/Tensor.h>
14+
#include <mlir/Dialect/Linalg/IR/Linalg.h>
915

1016
#if 0
1117
namespace x {
@@ -161,17 +167,15 @@ struct DeferredArange : public Deferred
161167

162168
bool generate_mlir(::mlir::OpBuilder & builder, ::mlir::Location loc, jit::DepManager & dm) override
163169
{
164-
// create start, stop and step
165-
auto start = jit::createI64(loc, builder, _start);
166-
auto end = jit::createI64(loc, builder, _end);
167-
auto step = jit::createI64(loc, builder, _step);
168-
// create arange
169-
auto dtype = builder.getI64Type();
170-
assert(_dtype == INT64 || _dtype == UINT64); // FIXME
171-
llvm::SmallVector<int64_t> shape(1, -1); //::mlir::ShapedType::kDynamicSize);
172-
auto artype = ::imex::ptensor::PTensorType::get(builder.getContext(), ::mlir::RankedTensorType::get(shape, dtype), true);
173-
dm.addVal(guid(),
174-
builder.create<::imex::ptensor::ARangeOp>(loc, artype, start, end, step, true),
170+
auto start = ::imex::createInt(loc, builder, _start);
171+
auto stop = ::imex::createInt(loc, builder, _end);
172+
auto step = ::imex::createInt(loc, builder, _step);
173+
auto dtype = builder.getI64Type(); // FIXME
174+
auto artype = ::imex::ptensor::PTensorType::get(builder.getContext(), ::mlir::RankedTensorType::get({-1}, dtype), false, true);
175+
auto dmy = ::imex::createInt<1>(loc, builder, 0);
176+
auto team = ::imex::createInt(loc, builder, 1);
177+
dm.addVal(this->guid(),
178+
builder.create<::imex::ptensor::ARangeOp>(loc, artype, start, stop, step, dmy, team),
175179
[this](uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides) {
176180
assert(rank == 1);
177181
assert(strides[0] == 1);

src/Deferred.cpp

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
#include <oneapi/tbb/concurrent_queue.h>
99
#include <mlir/Dialect/Func/IR/FuncOps.h>
1010
#include <imex/Dialect/PTensor/IR/PTensorOps.h>
11+
#include <imex/Dialect/Dist/IR/DistOps.h>
1112
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
1213

1314
#include <iostream>
@@ -71,10 +72,11 @@ void process_promises()
7172

7273
// Create a MLIR module
7374
auto module = builder.create<::mlir::ModuleOp>(loc);
74-
// Create a func
75-
auto dtype = builder.getI64Type();
75+
auto protos = builder.create<::imex::dist::RuntimePrototypesOp>(loc);
76+
module.push_back(protos);
77+
// Create the jit func
7678
// create dummy type, we'll replace it with the actual type later
77-
auto dummyFuncType = builder.getFunctionType({}, dtype);
79+
auto dummyFuncType = builder.getFunctionType({}, {});
7880
std::string fname("ddpt_jit");
7981
auto function = builder.create<::mlir::func::FuncOp>(loc, fname, dummyFuncType);
8082
// create function entry block
@@ -109,11 +111,12 @@ void process_promises()
109111
uint64_t osz = dm.handleResult(builder);
110112
// also request generation of c-wrapper function
111113
function->setAttr(::mlir::LLVM::LLVMDialect::getEmitCWrapperAttrName(), ::mlir::UnitAttr::get(&jit._context));
114+
function.getFunctionType().dump();
112115
// add the function to the module
113116
module.push_back(function);
114117
module.dump();
115118

116-
// get input buffers (before rsults!)
119+
// get input buffers (before results!)
117120
auto input = std::move(dm.store_inputs());
118121

119122
// compile and run the module

src/EWBinOp.cpp

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313

1414
#include <imex/Dialect/PTensor/IR/PTensorOps.h>
1515
#include <mlir/IR/Builders.h>
16+
#include <mlir/Dialect/Shape/IR/Shape.h>
1617

1718
// #######################################################################################
1819
// The 2 operators/tensors can have shifted partitions, e.g. local data might not be the
@@ -32,7 +33,7 @@
3233
// 2. local data which does not need communication
3334
// 3. Trailing remote data
3435
//
35-
// We attempt to minize copies by treating each region explicitly, e.g. data
36+
// We attempt to minimize copies by treating each region explicitly, e.g. data
3637
// which is already local will not be copied or communicated.
3738
//
3839
// Additionally, to reduce generated code size we convert buffers to the result
@@ -459,10 +460,10 @@ struct DeferredEWBinOp : public Deferred
459460
bool generate_mlir(::mlir::OpBuilder & builder, ::mlir::Location loc, jit::DepManager & dm) override
460461
{
461462
// FIXME the type of the result is based on a only
462-
auto a = dm.getDependent(builder, _a);
463-
auto b = dm.getDependent(builder, _b);
464-
dm.addVal(guid(),
465-
builder.create<::imex::ptensor::EWBinOp>(loc, a.getType(), builder.getI32IntegerAttr(ddpt2mlir(_op)), a, b),
463+
auto av = dm.getDependent(builder, _a);
464+
auto bv = dm.getDependent(builder, _b);
465+
dm.addVal(this->guid(),
466+
builder.create<::imex::ptensor::EWBinOp>(loc, av.getType(), builder.getI32IntegerAttr(ddpt2mlir(_op)), av, bv),
466467
[this](uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides) {
467468
this->set_value(std::move(mk_tnsr(_dtype, rank, allocated, aligned, offset, sizes, strides)));
468469
});

src/ReduceOp.cpp

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88

99
#include <imex/Dialect/PTensor/IR/PTensorOps.h>
1010
#include <mlir/IR/Builders.h>
11+
#include <mlir/Dialect/Shape/IR/Shape.h>
1112

1213
#if 0
1314
namespace x {
@@ -119,17 +120,17 @@ struct DeferredReduceOp : public Deferred
119120
bool generate_mlir(::mlir::OpBuilder & builder, ::mlir::Location loc, jit::DepManager & dm) override
120121
{
121122
// FIXME reduction over individual dimensions is not supported
122-
auto a = dm.getDependent(builder, _a);
123-
auto a_ptt = a.getType().dyn_cast<::imex::ptensor::PTensorType>();
124-
assert(a_ptt);
125-
126-
auto rtyp = ::imex::ptensor::PTensorType::get(
127-
builder.getContext(),
128-
::mlir::RankedTensorType::get(llvm::SmallVector<int64_t>(), a_ptt.getRtensor().getElementType()),
129-
true
130-
);
131-
dm.addVal(guid(),
132-
builder.create<::imex::ptensor::ReductionOp>(loc, rtyp, builder.getI32IntegerAttr(ddpt2mlir(_op)), a),
123+
auto av = dm.getDependent(builder, _a);
124+
auto aPtTyp = av.getType().dyn_cast<::imex::ptensor::PTensorType>();
125+
assert(aPtTyp);
126+
// return type 0d with same dtype as input
127+
auto dtype = aPtTyp.getRtensor().getElementType();
128+
auto retPtTyp = ::imex::ptensor::PTensorType::get(builder.getContext(), ::mlir::RankedTensorType::get({}, dtype), false, true);
129+
// reduction op
130+
auto mop = ddpt2mlir(_op);
131+
auto op = builder.getIntegerAttr(builder.getIntegerType(sizeof(mop)*8), mop);
132+
dm.addVal(this->guid(),
133+
builder.create<::imex::ptensor::ReductionOp>(loc, retPtTyp, op, av),
133134
[this](uint64_t rank, void *allocated, void *aligned, intptr_t offset, const intptr_t * sizes, const intptr_t * strides) {
134135
this->set_value(std::move(mk_tnsr(_dtype, rank, allocated, aligned, offset, sizes, strides)));
135136
});

src/Service.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,8 @@ ddptensor * Service::replicate(const ddptensor & a)
104104

105105
void Service::run()
106106
{
107-
defer_lambda([](){ return true; });
107+
defer<DeferredService>(DeferredService::RUN);
108+
// defer_lambda([](){ return true; });
108109
}
109110

110111
bool inited = false;

src/idtr.cpp

Lines changed: 67 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,39 +4,64 @@
44
#include <ddptensor/DDPTensorImpl.hpp>
55
#include <ddptensor/MPITransceiver.hpp>
66

7+
#include <imex/Dialect/PTensor/IR/PTensorOps.h>
8+
79
#include <cassert>
810
#include <memory>
911

1012
using container_type = std::unordered_map<id_type, std::unique_ptr<DDPTensorImpl>>;
1113

1214
static container_type gtensors;
15+
static id_type _nguid = -1;
16+
inline id_type get_guid()
17+
{
18+
return ++_nguid;
19+
}
1320

1421
// Transceiver * theTransceiver = MPITransceiver();
1522

23+
template<typename T>
24+
T * mr_to_ptr(void * ptr, intptr_t offset)
25+
{
26+
auto mr = reinterpret_cast<intptr_t*>(ptr);
27+
return reinterpret_cast<T*>(ptr) + offset; // &mr.aligned[mr.offset]
28+
}
29+
1630
extern "C" {
1731

1832
// Register a global tensor of given shape.
19-
// Accepts a guid which might have been reserved before. Returns guid (reserved or new).
33+
// Returns guid.
2034
// The runtime does not own or manage any memory.
21-
id_t idtr_init_dtensor(const uint64_t * shape, uint64_t N, id_t guid)
35+
id_t idtr_init_dtensor(const uint64_t * shape, uint64_t nD)
2236
{
23-
assert(guid != UNKNOWN_GUID);
24-
gtensors[guid] = std::unique_ptr<DDPTensorImpl>(new DDPTensorImpl(shape, N));
37+
auto guid = get_guid();
38+
gtensors[guid] = std::unique_ptr<DDPTensorImpl>(nD ? new DDPTensorImpl(shape, nD) : new DDPTensorImpl);
2539
return guid;
2640
}
2741

42+
id_t _idtr_init_dtensor(void * alloced, void * aligned, intptr_t offset, intptr_t size, intptr_t stride, uint64_t nD)
43+
{
44+
return idtr_init_dtensor(mr_to_ptr<uint64_t>(aligned, offset), nD);
45+
}
46+
2847
// Get the offsets (one for each dimension) of the local partition of a distributed tensor in number of elements.
2948
// Result is stored in provided array.
30-
void idtr_local_offsets(id_t guid, uint64_t * offsets, uint64_t N)
49+
void idtr_local_offsets(id_t guid, uint64_t * offsets, uint64_t nD)
3150
{
3251
const auto & tnsr = gtensors.at(guid);
3352
auto slcs = tnsr->slice().local_slice().slices();
53+
assert(nD == slcs.size());
3454
int i = -1;
3555
for(auto s : slcs) {
3656
offsets[++i] = s._start;
3757
}
3858
}
3959

60+
void _idtr_local_offsets(id_t guid, void * alloced, void * aligned, intptr_t offset, intptr_t size, intptr_t stride, uint64_t nD)
61+
{
62+
idtr_local_offsets(guid, mr_to_ptr<uint64_t>(aligned, offset), nD);
63+
}
64+
4065
// Get the shape (one size for each dimension) of the local partition of a distributed tensor in number of elements.
4166
// Result is stored in provided array.
4267
void idtr_local_shape(id_t guid, uint64_t * lshape, uint64_t N)
@@ -46,10 +71,45 @@ void idtr_local_shape(id_t guid, uint64_t * lshape, uint64_t N)
4671
std::copy(shp.begin(), shp.end(), lshape);
4772
}
4873

74+
void _idtr_local_shape(id_t guid, void * alloced, void * aligned, intptr_t offset, intptr_t size, intptr_t stride, uint64_t nD)
75+
{
76+
idtr_local_shape(guid, mr_to_ptr<uint64_t>(aligned, offset), nD);
77+
}
78+
79+
// convert id of our reduction op to id of imex::ptensor reduction op
80+
static ReduceOpId mlir2ddpt(const ::imex::ptensor::ReduceOpId rop)
81+
{
82+
switch(rop) {
83+
case ::imex::ptensor::MEAN:
84+
return MEAN;
85+
case ::imex::ptensor::PROD:
86+
return PROD;
87+
case ::imex::ptensor::SUM:
88+
return SUM;
89+
case ::imex::ptensor::STD:
90+
return STD;
91+
case ::imex::ptensor::VAR:
92+
return VAR;
93+
case ::imex::ptensor::MAX:
94+
return MAX;
95+
case MIN:
96+
return MIN;
97+
default:
98+
throw std::runtime_error("Unknown reduction operation");
99+
}
100+
}
101+
49102
// Elementwise inplace allreduce
50-
void idtr_reduce_all(void * inout, DTypeId dtype, size_t N, RedOpType op)
103+
void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, int op)
104+
{
105+
106+
getTransceiver()->reduce_all(inout, dtype, N, mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
107+
}
108+
109+
// FIXME hard-coded 0d tensor
110+
void _idtr_reduce_all(uint64_t * allocated, uint64_t * aligned, uint64_t offset, DTypeId dtype, int op)
51111
{
52-
getTransceiver()->reduce_all(inout, dtype, N, op);
112+
idtr_reduce_all(aligned + offset, dtype, 1, op);
53113
}
54114

55115
} // extern "C"

src/include/ddptensor/DDPTensorImpl.hpp

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -21,12 +21,12 @@ class DDPTensorImpl : public tensor_i
2121
{
2222
mutable rank_type _owner;
2323
PVSlice _slice;
24-
void * _allocated;
25-
void * _aligned;
26-
intptr_t * _sizes;
27-
intptr_t * _strides;
28-
uint64_t _offset;
29-
DTypeId _dtype;
24+
void * _allocated = nullptr;
25+
void * _aligned = nullptr;
26+
intptr_t * _sizes = nullptr;
27+
intptr_t * _strides = nullptr;
28+
uint64_t _offset = 0;
29+
DTypeId _dtype = DTYPE_LAST;
3030

3131
public:
3232
using ptr_type = std::shared_ptr<DDPTensorImpl>;
@@ -54,9 +54,6 @@ class DDPTensorImpl : public tensor_i
5454
DDPTensorImpl(DTypeId dtype, const shape_type & shp, rank_type owner=NOOWNER)
5555
: _owner(owner),
5656
_slice(shp, static_cast<int>(owner==REPLICATED ? NOSPLIT : 0)),
57-
_allocated(nullptr),
58-
_aligned(nullptr),
59-
_offset(0),
6057
_dtype(dtype)
6158
{
6259
alloc();
@@ -73,11 +70,14 @@ class DDPTensorImpl : public tensor_i
7370
// incomplete, useful for computing meta information
7471
DDPTensorImpl(const uint64_t * shape, uint64_t N, rank_type owner=NOOWNER)
7572
: _owner(owner),
76-
_slice(shape_type(shape, shape+N), static_cast<int>(owner==REPLICATED ? NOSPLIT : 0)),
77-
_allocated(nullptr),
78-
_aligned(nullptr),
79-
_offset(0),
80-
_dtype(DTYPE_LAST)
73+
_slice(shape_type(shape, shape+N), static_cast<int>(owner==REPLICATED ? NOSPLIT : 0))
74+
{
75+
}
76+
77+
// incomplete, useful for computing meta information
78+
DDPTensorImpl()
79+
: _owner(REPLICATED),
80+
_slice(shape_type(), static_cast<int>(NOSPLIT))
8181
{
8282
}
8383

src/include/ddptensor/idtr.hpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ extern "C" {
1111
// Register a global tensor of given shape.
1212
// Accepts a guid which might have been reserved before. Returns guid (reserved or new).
1313
// The runtime does not own or manage any memory.
14-
id_t idtr_nit_dtensor(const uint64_t * shape, uint64_t N, id_t guid = UNKNOWN_GUID);
14+
id_t idtr_init_dtensor(const uint64_t * shape, uint64_t N);
1515

1616
// Get the offsets (one for each dimension) of the local partition of a distributed tensor in number of elements.
1717
// Result is stored in provided array.
@@ -22,6 +22,6 @@ extern "C" {
2222
void idtr_local_shape(id_t guid, uint64_t * lshape, uint64_t N);
2323

2424
// Elementwise inplace allreduce
25-
void idtr_reduce_all(void * inout, DTypeId T, size_t N, RedOpType op);
26-
25+
void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, int op);
26+
2727
} // extern "C"

0 commit comments

Comments (0)