|
| 1 | +#include <mpi.h> |
//#include <mkl.h> // NOTE(review): disabled, yet cblas_*gemm is used below — presumably the CBLAS declarations arrive transitively via xtensor-blas; verify against the build setup
| 3 | +#include "ddptensor/LinAlgOp.hpp" |
| 4 | +#include "ddptensor/TypeDispatch.hpp" |
| 5 | +#include "ddptensor/x.hpp" |
| 6 | +#include "xtensor-blas/xlinalg.hpp" |
| 7 | + |
| 8 | +namespace x { |
| 9 | + |
| 10 | + template<typename T> struct TGEMM; |
| 11 | + template<> struct TGEMM<double> { static constexpr auto tgemm = cblas_dgemm; }; |
| 12 | + template<> struct TGEMM<float> { static constexpr auto tgemm = cblas_sgemm; }; |
| 13 | + |
| 14 | + class LinAlgOp |
| 15 | + { |
| 16 | + public: |
| 17 | + using ptr_type = DPTensorBaseX::ptr_type; |
| 18 | + |
| 19 | + template<typename A, typename B> |
| 20 | + static ptr_type op(int axis, const std::shared_ptr<DPTensorX<A>> & a_ptr, const std::shared_ptr<DPTensorX<B>> & b_ptr) |
| 21 | + { |
| 22 | + if constexpr (std::is_floating_point<A>::value && std::is_same<A, B>::value) { |
| 23 | + const auto & ax = a_ptr->xarray(); |
| 24 | + const auto & bx = b_ptr->xarray(); |
| 25 | + auto nda = a_ptr->slice().ndims(); |
| 26 | + auto ndb = b_ptr->slice().ndims(); |
| 27 | + |
| 28 | + if(a_ptr->is_sliced() || b_ptr->is_sliced()) { |
| 29 | + if(nda != 1 || ndb != 1) |
| 30 | + throw(std::runtime_error("vecdoc on sliced tensors supported for 1d tensors only")); |
| 31 | + const auto & av = xt::strided_view(ax, a_ptr->lslice()); |
| 32 | + const auto & bv = xt::strided_view(bx, b_ptr->lslice()); |
| 33 | + return vecdot_1d(av, bv, axis); |
| 34 | + } |
| 35 | + |
| 36 | + if(nda == 1 && ndb == 1) { |
| 37 | + return vecdot_1d(ax, bx, axis); |
| 38 | + } else if(nda == 2 && ndb == 2) { |
| 39 | + return matmul_2d(a_ptr, b_ptr, axis); |
| 40 | + } |
| 41 | + throw(std::runtime_error("'vecdot' supported for two 1d or two 2d tensors only.")); |
| 42 | + } else |
| 43 | + throw(std::runtime_error("'vecdot' supported for 2 double or float tensors only.")); |
| 44 | + } |
| 45 | + |
| 46 | + template<typename T1, typename T2> |
| 47 | + static ptr_type vecdot_1d(const T1 & a, const T2 & b, int axis) |
| 48 | + { |
| 49 | + auto d = xt::linalg::dot(a, b)(); |
| 50 | + theTransceiver->reduce_all(&d, DTYPE<decltype(d)>::value, 1, SUM); |
| 51 | + return operatorx<decltype(d)>::mk_tx(d, REPLICATED); |
| 52 | + } |
| 53 | + |
| 54 | + template<typename A, typename B> |
| 55 | + static ptr_type matmul_2d(const std::shared_ptr<DPTensorX<A>> & a_ptr, const std::shared_ptr<DPTensorX<B>> & b_ptr, int axis) |
| 56 | + { |
| 57 | + if(!a_ptr->slice().is_equally_tiled() || !b_ptr->slice().is_equally_tiled()) |
| 58 | + throw(std::runtime_error("vecdoc_2d supported for eually tiled tensors only")); |
| 59 | + if(a_ptr->slice().split_dim() != 0) |
| 60 | + throw(std::runtime_error("vecdoc_2d supported for split_dim=0 only")); |
| 61 | + |
| 62 | + auto nr = theTransceiver->nranks(); |
| 63 | + auto me = theTransceiver->rank(); |
| 64 | + rank_type right = (me + 1) % nr; |
| 65 | + rank_type left = (nr + me - 1) % nr; |
| 66 | + auto tsz = b_ptr->slice().tile_size(0); |
| 67 | + auto tshpa = a_ptr->slice().tile_shape(0); |
| 68 | + auto tshpb = b_ptr->slice().tile_shape(0); |
| 69 | + |
| 70 | + const auto & ax = a_ptr->xarray(); |
| 71 | + const auto & bx = b_ptr->xarray(); |
| 72 | + xt::xarray<A> cx = xt::zeros<A>({tshpa[0], tshpb[1]}); |
| 73 | + auto buff = xt::empty_like(bx); |
| 74 | + buff = bx; |
| 75 | + |
| 76 | + // We use an algo similar to Canon's |
| 77 | + for(rank_type i = nr; i>0; --i) { |
| 78 | + // std::cerr << me*tshpb[0] << " " << (1+me) * tshpb[0] << std::endl; |
| 79 | + // auto av = xt::view(ax, xt::all(), xt::range(me * tshpb[0], (1+me) * tshpb[0])); |
| 80 | + // cx = cx + xt::linalg::dot(av, buff); |
| 81 | + TGEMM<A>::tgemm(CblasRowMajor, |
| 82 | + CblasNoTrans, |
| 83 | + CblasNoTrans, |
| 84 | + tshpa[0], |
| 85 | + tshpb[1], |
| 86 | + tshpb[0], |
| 87 | + 1, // alpha |
| 88 | + ax.data() + (me * tshpb[0]), |
| 89 | + tshpa[1], // lda |
| 90 | + buff.data(), |
| 91 | + tshpb[1], // ldb |
| 92 | + 1, // beta |
| 93 | + cx.data(), |
| 94 | + tshpb[1]); // ldc |
| 95 | + |
| 96 | + if(i > 1) { |
| 97 | + // data exchange |
| 98 | + theTransceiver->send_recv(buff.data(), |
| 99 | + tsz, |
| 100 | + DTYPE<A>::value, |
| 101 | + left, |
| 102 | + right); |
| 103 | + me = (me + 1) % nr; |
| 104 | + } |
| 105 | + } |
| 106 | + return operatorx<A>::mk_tx(std::move(PVSlice({a_ptr->slice().shape()[0], b_ptr->slice().shape()[1]})), cx); |
| 107 | + } |
| 108 | + }; |
| 109 | +} |
| 110 | + |
// Public 'vecdot' entry point: resolves the operands' element types at
// runtime and forwards to the matching x::LinAlgOp::op instantiation.
tensor_i::ptr_type LinAlgOp::vecdot(tensor_i::ptr_type a, tensor_i::ptr_type b, int axis)
{
    return TypeDispatch<x::LinAlgOp>(a, b, axis);
}
0 commit comments