Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit b303f2f

Browse files
committed
Correctly handle 0-d tensors, 1-element tensors, and various element types
1 parent 5f57e98 commit b303f2f

File tree

9 files changed

+152
-54
lines changed

9 files changed

+152
-54
lines changed

ddptensor/__init__.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,21 @@ def to_numpy(a):
6262

6363
for func in api.api_categories["Creator"]:
6464
FUNC = func.upper()
65-
if func in ["empty", "ones", "zeros",]:
65+
if func == "full":
6666
exec(
67-
f"{func} = lambda shape, dtype: dtensor(_cdt.Creator.create_from_shape(_cdt.{FUNC}, shape, dtype))"
67+
f"{func} = lambda shape, val, dtype: dtensor(_cdt.Creator.full(shape, val, dtype))"
6868
)
69-
elif func == "full":
69+
elif func == "empty":
7070
exec(
71-
f"{func} = lambda shape, val, dtype: dtensor(_cdt.Creator.full(shape, val, dtype))"
71+
f"{func} = lambda shape, dtype: dtensor(_cdt.Creator.full(shape, None, dtype))"
72+
)
73+
elif func == "ones":
74+
exec(
75+
f"{func} = lambda shape, dtype: dtensor(_cdt.Creator.full(shape, 1, dtype))"
76+
)
77+
elif func == "zeros":
78+
exec(
79+
f"{func} = lambda shape, dtype: dtensor(_cdt.Creator.full(shape, 0, dtype))"
7280
)
7381
elif func == "arange":
7482
exec(

src/DDPTensorImpl.cpp

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ DDPTensorImpl::DDPTensorImpl(DTypeId dtype, uint64_t ndims,
1616
: _owner(owner),
1717
_allocated(allocated),
1818
_aligned(aligned),
19-
_sizes(new intptr_t[ndims]),
20-
_strides(new intptr_t[ndims]),
2119
_gs_allocated(gs_allocated),
2220
_gs_aligned(gs_aligned),
2321
_lo_allocated(lo_allocated),
@@ -26,8 +24,15 @@ DDPTensorImpl::DDPTensorImpl(DTypeId dtype, uint64_t ndims,
2624
_ndims(ndims),
2725
_dtype(dtype)
2826
{
29-
memcpy(_sizes, sizes, ndims*sizeof(*_sizes));
30-
memcpy(_strides, strides, ndims*sizeof(*_strides));
27+
if(ndims > 0) {
28+
_sizes = new intptr_t[ndims];
29+
_strides = new intptr_t[ndims];
30+
memcpy(_sizes, sizes, ndims*sizeof(*_sizes));
31+
memcpy(_strides, strides, ndims*sizeof(*_strides));
32+
} else {
33+
_owner = REPLICATED;
34+
assert(_aligned);
35+
}
3136
}
3237

3338
DDPTensorImpl::DDPTensorImpl(DTypeId dtype, const shape_type & shp, rank_type owner)
@@ -72,15 +77,17 @@ DDPTensorImpl::ptr_type DDPTensorImpl::clone(bool copy)
7277
gs_allocated, gs_aligned, lo_allocated, lo_aligned, owner());
7378
}
7479

75-
void DDPTensorImpl::alloc()
80+
void DDPTensorImpl::alloc(bool all)
7681
{
7782
auto esz = sizeof_dtype(_dtype);
78-
_allocated = new (std::align_val_t(esz)) char[esz*size()];
83+
_allocated = new (std::align_val_t(esz)) char[esz*local_size()];
7984
_aligned = _allocated;
80-
auto nds = ndims();
81-
_sizes = new intptr_t[nds];
82-
_strides = new intptr_t[nds];
8385
_offset = 0;
86+
if(all) {
87+
auto nds = ndims();
88+
_sizes = new intptr_t[nds];
89+
_strides = new intptr_t[nds];
90+
}
8491
}
8592

8693
void * DDPTensorImpl::data()
@@ -106,8 +113,11 @@ std::string DDPTensorImpl::__repr__() const
106113

107114
dispatch(_dtype, _aligned, [this, nd, &oss](auto * ptr) {
108115
auto cptr = ptr + this->_offset;
109-
if(nd>0) printit(oss, 0, cptr);
110-
else oss << *cptr;
116+
if(nd>0) {
117+
printit(oss, 0, cptr);
118+
} else {
119+
oss << *cptr;
120+
}
111121
});
112122
return oss.str();
113123
}
@@ -189,3 +199,26 @@ void DDPTensorImpl::add_to_args(std::vector<void*> & args, int ndims)
189199
buff[4] = 1;
190200
args.push_back(buff);
191201
}
202+
203+
void DDPTensorImpl::replicate()
204+
{
205+
if(is_replicated()) return;
206+
auto gsz = size();
207+
auto lsz = local_size();
208+
if(gsz > 1) throw(std::runtime_error("Replication implemented for single-element tensors only."));
209+
if(lsz != gsz) {
210+
assert(lsz == 0);
211+
auto nd = ndims();
212+
for(auto i=0; i<nd; ++i) {
213+
_sizes[i] = _strides[i] = 1;
214+
}
215+
_sizes[nd-1] = gsz;
216+
}
217+
dispatch(_dtype, _aligned, [this, lsz, gsz](auto * ptr) {
218+
auto tmp = ptr[this->_offset];
219+
if(lsz != gsz) ptr[this->_offset] = 0;
220+
getTransceiver()->reduce_all(&ptr[this->_offset], this->_dtype, 1, SUM);
221+
assert(lsz != gsz || tmp == ptr[this->_offset]);
222+
});
223+
set_owner(REPLICATED);
224+
}

src/Service.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,20 +52,20 @@ struct DeferredService : public Deferred
5252

5353
void run()
5454
{
55-
#if 0
5655
switch(_op) {
5756
case REPLICATE: {
5857
const auto a = std::move(Registry::get(_a).get());
59-
set_value(std::move(TypeDispatch<x::Service>(a)));
58+
auto ddpt = dynamic_cast<DDPTensorImpl*>(a.get());
59+
assert(ddpt);
60+
ddpt->replicate();
61+
set_value(a);
6062
break;
6163
}
62-
case DROP:
63-
Registry::del(_a);
64+
case RUN:
6465
break;
6566
default:
66-
throw(std::runtime_error("Unkown Service operation requested."));
67+
throw(std::runtime_error("Unkown Service operation requested."));
6768
}
68-
#endif
6969
}
7070

7171
bool generate_mlir(::mlir::OpBuilder & builder, ::mlir::Location loc, jit::DepManager & dm) override
@@ -76,6 +76,7 @@ struct DeferredService : public Deferred
7676
// FIXME create delete op and return it
7777
break;
7878
case RUN:
79+
case REPLICATE:
7980
return true;
8081
default:
8182
throw(std::runtime_error("Unkown Service operation requested."));

src/idtr.cpp

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ void _idtr_local_shape(id_t guid, void * alloced, void * aligned, intptr_t offse
9494
{
9595
idtr_local_shape(guid, mr_to_ptr<uint64_t>(aligned, offset), nD);
9696
}
97+
} // extern "C"
9798

9899
// convert id of our reduction op to id of imex::ptensor reduction op
99100
static ReduceOpId mlir2ddpt(const ::imex::ptensor::ReduceOpId rop)
@@ -118,17 +119,61 @@ static ReduceOpId mlir2ddpt(const ::imex::ptensor::ReduceOpId rop)
118119
}
119120
}
120121

122+
static DTypeId mlir2ddpt(const ::imex::ptensor::DType dt)
123+
{
124+
switch(dt) {
125+
case ::imex::ptensor::DType::F64:
126+
return FLOAT64;
127+
break;
128+
case ::imex::ptensor::DType::I64:
129+
return INT64;
130+
break;
131+
case ::imex::ptensor::DType::U64:
132+
return UINT64;
133+
break;
134+
case ::imex::ptensor::DType::F32:
135+
return FLOAT32;
136+
break;
137+
case ::imex::ptensor::DType::I32:
138+
return INT32;
139+
break;
140+
case ::imex::ptensor::DType::U32:
141+
return UINT32;
142+
break;
143+
case ::imex::ptensor::DType::I16:
144+
return INT16;
145+
break;
146+
case ::imex::ptensor::DType::U16:
147+
return UINT16;
148+
break;
149+
case ::imex::ptensor::DType::I8:
150+
return INT8;
151+
break;
152+
case ::imex::ptensor::DType::U8:
153+
return UINT8;
154+
break;
155+
case ::imex::ptensor::DType::I1:
156+
return BOOL;
157+
break;
158+
default:
159+
throw std::runtime_error("unknown dtype");
160+
};
161+
}
162+
163+
extern "C" {
121164
// Elementwise inplace allreduce
122-
void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, int op)
165+
void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, ReduceOpId op)
123166
{
124-
getTransceiver()->reduce_all(inout, dtype, N, mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
167+
getTransceiver()->reduce_all(inout, dtype, N, op);
125168
}
126169

127170
// FIXME hard-coded for contiguous layout
128-
void _idtr_reduce_all(uint64_t rank, void * data, int64_t * sizes, int64_t * strides, DTypeId dtype, int op)
171+
void _idtr_reduce_all(uint64_t rank, void * data, int64_t * sizes, int64_t * strides, int dtype, int op)
129172
{
130173
assert(rank == 0 || strides[rank-1] == 1);
131-
idtr_reduce_all(data, dtype, rank ? rank : 1, op);
174+
idtr_reduce_all(data,
175+
mlir2ddpt(static_cast<::imex::ptensor::DType>(dtype)),
176+
rank ? rank : 1,
177+
mlir2ddpt(static_cast<imex::ptensor::ReduceOpId>(op)));
132178
}
133-
134179
} // extern "C"

src/include/ddptensor/CppTypes.hpp

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ using InputAdapter = bitsery::InputBufferAdapter<Buffer>;
2222
using Serializer = bitsery::Serializer<OutputAdapter>;
2323
using Deserializer = bitsery::Deserializer<InputAdapter>;
2424

25+
union PyScalar
26+
{
27+
int64_t _int;
28+
double _float;
29+
};
30+
2531
enum _RANKS: rank_type {
2632
NOOWNER = std::numeric_limits<rank_type>::max(),
2733
REPLICATED = std::numeric_limits<rank_type>::max() - 1,
@@ -42,17 +48,17 @@ template<> struct DTYPE<uint8_t> { constexpr static DTypeId value = UINT8; };
4248
template<> struct DTYPE<bool> { constexpr static DTypeId value = BOOL; };
4349

4450
template<DTypeId DT> struct TYPE {};
45-
template<> struct TYPE<FLOAT64> { using dtype = double; };
46-
template<> struct TYPE<FLOAT32> { using dtype = float; };
47-
template<> struct TYPE<INT64> { using dtype = int64_t; };
48-
template<> struct TYPE<INT32> { using dtype = int32_t; };
49-
template<> struct TYPE<INT16> { using dtype = int16_t; };
50-
template<> struct TYPE<INT8> { using dtype = int8_t; };
51-
template<> struct TYPE<UINT64> { using dtype = uint64_t; };
52-
template<> struct TYPE<UINT32> { using dtype = uint32_t; };
53-
template<> struct TYPE<UINT16> { using dtype = uint16_t; };
54-
template<> struct TYPE<UINT8> { using dtype = uint8_t; };
55-
template<> struct TYPE<BOOL> { using dtype = bool; };
51+
template<> struct TYPE<FLOAT64> { using dtype = double; static constexpr bool is_integral = false; static constexpr bool is_float = true; };
52+
template<> struct TYPE<FLOAT32> { using dtype = float; static constexpr bool is_integral = false; static constexpr bool is_float = true; };
53+
template<> struct TYPE<INT64> { using dtype = int64_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
54+
template<> struct TYPE<INT32> { using dtype = int32_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
55+
template<> struct TYPE<INT16> { using dtype = int16_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
56+
template<> struct TYPE<INT8> { using dtype = int8_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
57+
template<> struct TYPE<UINT64> { using dtype = uint64_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
58+
template<> struct TYPE<UINT32> { using dtype = uint32_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
59+
template<> struct TYPE<UINT16> { using dtype = uint16_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
60+
template<> struct TYPE<UINT8> { using dtype = uint8_t; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
61+
template<> struct TYPE<BOOL> { using dtype = bool; static constexpr bool is_integral = true; static constexpr bool is_float = false; };
5662

5763
static size_t sizeof_dtype(const DTypeId dt) {
5864
switch(dt) {

src/include/ddptensor/DDPTensorImpl.hpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class DDPTensorImpl : public tensor_i
6161

6262
DDPTensorImpl::ptr_type clone(bool copy = true);
6363

64-
void alloc();
64+
void alloc(bool all = true);
6565

6666
~DDPTensorImpl()
6767
{
@@ -100,11 +100,16 @@ class DDPTensorImpl : public tensor_i
100100
{
101101
switch(ndims()) {
102102
case 0 : return 1;
103-
case 1 : return *_sizes;
104-
default: return std::accumulate(_sizes, _sizes+ndims(), 1, std::multiplies<intptr_t>());
103+
case 1 : return *_gs_aligned;
104+
default: return std::accumulate(_gs_aligned, _gs_aligned+ndims(), 1, std::multiplies<intptr_t>());
105105
}
106106
}
107107

108+
uint64_t local_size() const
109+
{
110+
return ndims() == 0 ? 0 : std::accumulate(_sizes, _sizes+ndims(), 1, std::multiplies<intptr_t>());
111+
}
112+
108113
friend struct Service;
109114

110115
virtual bool __bool__() const;
@@ -113,7 +118,7 @@ class DDPTensorImpl : public tensor_i
113118

114119
virtual uint64_t __len__() const
115120
{
116-
return ndims() ? *_sizes : 0;
121+
return ndims() ? *_gs_aligned : 1;
117122
}
118123

119124
bool has_owner() const
@@ -167,6 +172,8 @@ class DDPTensorImpl : public tensor_i
167172
oss << "]";
168173
}
169174
}
175+
176+
void replicate();
170177
};
171178

172179
template<typename ...Ts>

src/include/ddptensor/PyTypes.hpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,6 @@ namespace py = pybind11;
1010

1111
template<typename T> py::object get_impl_dtype() { return get_impl_dtype(DTYPE<T>::value); };
1212

13-
union PyScalar
14-
{
15-
int64_t _int;
16-
double _float;
17-
};
18-
1913
inline PyScalar mk_scalar(const py::object & b, DTypeId dtype)
2014
{
2115
PyScalar s;

src/include/ddptensor/idtr.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,6 @@ extern "C" {
2828
void idtr_local_shape(id_t guid, uint64_t * lshape, uint64_t N);
2929

3030
// Elementwise inplace allreduce
31-
void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, int op);
31+
void idtr_reduce_all(void * inout, DTypeId dtype, uint64_t N, ReduceOpId op);
3232

3333
} // extern "C"

test/test_ewb.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
import ddptensor as dt
22
import numpy as np
33

4+
mpi_dtypes = [dt.float64, dt.float32, dt.int64, dt.uint64, dt.int32, dt.uint32, dt.int8, dt.uint8]
5+
46
class TestEWB:
57
def test_add1(self):
6-
a = dt.ones([16,16], dtype=dt.float64)
7-
b = dt.ones([16,16], dtype=dt.float64)
8-
c = a + b
9-
r1 = dt.sum(c, [0,1])
10-
v = 16*16*2
11-
assert float(r1) == v
8+
for dtyp in mpi_dtypes:
9+
print(dtyp)
10+
a = dt.ones([6,6], dtype=dtyp)
11+
b = dt.ones([6,6], dtype=dtyp)
12+
c = a + b
13+
r1 = dt.sum(c, [0,1])
14+
v = 6*6*2
15+
assert float(r1) == v
1216

1317
def test_add2(self):
1418
a = dt.ones([16,16], dtype=dt.float64)

0 commit comments

Comments (0)