Skip to content
This repository was archived by the owner on Jan 26, 2026. It is now read-only.

Commit 1edc643

Browse files
committed
adding caching of compiled code; adding Tosa bits
1 parent fe1976a commit 1edc643

File tree

6 files changed

+84
-39
lines changed

6 files changed

+84
-39
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ target_link_libraries(_ddpt_rt PRIVATE
185185
MLIRLinalgTransforms
186186
MLIRLLVMDialect
187187
MLIRMathDialect
188+
MLIRMathToFuncs
189+
MLIRMathToLibm
188190
MLIRMathToLLVM
189191
MLIRMathTransforms
190192
MLIRMemRefDialect
@@ -197,6 +199,8 @@ target_link_libraries(_ddpt_rt PRIVATE
197199
MLIRShapeDialect
198200
MLIRShapeOpsTransforms
199201
MLIRShapeToStandard
202+
MLIRTosaDialect
203+
MLIRTosaToLinalg
200204
MLIRTensorTransforms
201205
)
202206
# LLVM${LLVM_NATIVE_ARCH}CodeGen

src/DDPTensorImpl.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,6 @@ void DDPTensorImpl::add_to_args(std::vector<void *> &args, int ndims) {
162162
buff[2] = static_cast<intptr_t>(_offset);
163163
memcpy(buff + 3, _sizes, ndims * sizeof(intptr_t));
164164
memcpy(buff + 3 + ndims, _strides, ndims * sizeof(intptr_t));
165-
for (auto i = 0; i < 3 + 2 * ndims; ++i)
166-
std::cerr << " " << buff[i];
167165
args.push_back(buff);
168166
// second the transceiver
169167
args.push_back(&_transceiver);

src/Deferred.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include <oneapi/tbb/concurrent_queue.h>
2121

2222
#include <iostream>
23-
#include <unordered_set>
2423

2524
// thread-safe FIFO queue holding deferred objects
2625
static tbb::concurrent_bounded_queue<Runable::ptr_type> _deferred;
@@ -71,7 +70,7 @@ void Runable::defer(Runable::ptr_type &&p) { push_runable(std::move(p)); }
7170
void Runable::fini() { _deferred.clear(); }
7271

7372
// process promises as they arrive through calls to defer
74-
// This is run in a separate thread until shutdon is requested.
73+
// This is run in a separate thread until shutdown is requested.
7574
// Shutdown is indicated by a Deferred object which evaluates to false.
7675
// The loop repeatedly creates MLIR functions for jit-compilation by letting
7776
// Deferred objects add their MLIR code until an object can not produce MLIR
@@ -138,14 +137,12 @@ void process_promises() {
138137

139138
if (osz > 0 || !input.empty()) {
140139
// compile and run the module
141-
intptr_t *output = new intptr_t[osz];
142-
if (jit.run(module, fname, input, output))
140+
auto output = jit.run(module, fname, input, osz);
141+
if (output.size() != osz)
143142
throw std::runtime_error("failed running jit");
144143

145144
// push results to deliver promises
146145
dm.deliver(output, osz);
147-
148-
delete[] output;
149146
} else {
150147
std::cerr << "\tskipping\n";
151148
}

src/include/ddptensor/jit/mlir.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ class DepManager {
113113
uint64_t handleResult(::mlir::OpBuilder &builder);
114114

115115
/// deliver promise after execution
116-
void deliver(intptr_t *, uint64_t);
116+
void deliver(std::vector<intptr_t> &, uint64_t);
117117

118118
/// @return total size of all input arguments in number of intptr_t
119119
uint64_t arg_size();
@@ -137,12 +137,13 @@ class JIT {
137137

138138
JIT();
139139
// run
140-
int run(::mlir::ModuleOp &, const std::string &, std::vector<void *> &,
141-
intptr_t *);
140+
std::vector<intptr_t> run(::mlir::ModuleOp &, const std::string &,
141+
std::vector<void *> &, size_t);
142142

143143
::mlir::MLIRContext _context;
144144
::mlir::PassManager _pm;
145-
bool _verbose;
145+
bool _verbose, _useCache;
146+
const char *_sharedLibPaths;
146147
};
147148

148149
// size of memreftype in number of intptr_t's

src/jit/mlir.cpp

Lines changed: 70 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -65,19 +65,20 @@
6565
// #include "mlir/Dialect/SparseTensor/Pipelines/Passes.h"
6666
// #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
6767
#include "mlir/Dialect/Tensor/Transforms/Passes.h"
68-
// #include "mlir/Dialect/Tosa/Transforms/Passes.h"
68+
#include "mlir/Dialect/Tosa/Transforms/Passes.h"
6969
// #include "mlir/Dialect/Transform/Transforms/Passes.h"
7070
// #include "mlir/Dialect/Vector/Transforms/Passes.h"
7171
#include "mlir/Transforms/Passes.h"
7272
// #include <mlir/InitAllPasses.h>
7373

7474
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
7575
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
76-
7776
#include "mlir/ExecutionEngine/ExecutionEngine.h"
7877
#include "mlir/ExecutionEngine/OptUtils.h"
7978
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
8079

80+
#include <llvm/Support/raw_sha1_ostream.h>
81+
8182
#include <imex/Dialect/PTensor/IR/PTensorOps.h>
8283
#include <imex/InitIMEXDialects.h>
8384
#include <imex/InitIMEXPasses.h>
@@ -178,7 +179,6 @@ std::vector<void *> DepManager::store_inputs() {
178179
std::vector<void *> res;
179180
for (auto a : _args) {
180181
auto f = Registry::get(a.first);
181-
std::cerr << " store guid " << a.first;
182182
f.get().get()->add_to_args(res, a.second);
183183
_ivm.erase(a.first); // inputs need no delivery
184184
_icm.erase(a.first);
@@ -254,7 +254,8 @@ uint64_t DepManager::handleResult(::mlir::OpBuilder &builder) {
254254
return 2 * sz;
255255
}
256256

257-
void DepManager::deliver(intptr_t *output, uint64_t sz) {
257+
void DepManager::deliver(std::vector<intptr_t> &outputV, uint64_t sz) {
258+
auto output = outputV.data();
258259
size_t pos = 0;
259260
for (auto &v : _icm) {
260261
auto rank = _irm[v.first];
@@ -305,14 +306,30 @@ void DepManager::deliver(intptr_t *output, uint64_t sz) {
305306
}
306307
}
307308

308-
int JIT::run(::mlir::ModuleOp &module, const std::string &fname,
309-
std::vector<void *> &inp, intptr_t *out) {
310-
// lower to LLVM
311-
if (::mlir::failed(_pm.run(module)))
312-
throw std::runtime_error("failed to run pass manager");
313-
314-
if (_verbose)
315-
module.dump();
309+
std::vector<intptr_t> JIT::run(::mlir::ModuleOp &module,
310+
const std::string &fname,
311+
std::vector<void *> &inp, size_t osz) {
312+
if (_useCache) {
313+
::mlir::ModuleOp cached;
314+
static std::vector<
315+
std::pair<std::array<unsigned char, 20>, ::mlir::ModuleOp>>
316+
cache;
317+
llvm::raw_sha1_ostream xxx;
318+
module->print(xxx);
319+
auto cksm = xxx.sha1();
320+
for (auto x : cache) {
321+
if (x.first == cksm) {
322+
cached = x.second;
323+
break;
324+
}
325+
}
326+
if (cached) {
327+
module = cached;
328+
std::cerr << "using cached module" << std::endl;
329+
} else {
330+
cache.push_back(std::make_pair(cksm, module));
331+
}
332+
}
316333

317334
// An optimization pipeline to use within the execution engine.
318335
auto optPipeline =
@@ -322,21 +339,27 @@ int JIT::run(::mlir::ModuleOp &module, const std::string &fname,
322339

323340
// Create an ::mlir execution engine. The execution engine eagerly
324341
// JIT-compiles the module.
325-
::mlir::ExecutionEngineOptions engineOptions;
326-
engineOptions.transformer = optPipeline;
327-
// const char * crunner = getenv("DDPT_CRUNNER_SO");
328-
// crunner = crunner ? crunner : "libmlir_c_runner_utils.so";
329-
const char *idtr = getenv("DDPT_IDTR_SO");
330-
idtr = idtr ? idtr : "libidtr.so";
331-
// ::llvm::ArrayRef<::llvm::StringRef> shlibs = {crunner, idtr};
332-
engineOptions.sharedLibPaths = {idtr};
333-
auto maybeEngine = ::mlir::ExecutionEngine::create(module, engineOptions);
342+
::mlir::ExecutionEngineOptions opts;
343+
opts.transformer = optPipeline;
344+
opts.sharedLibPaths = {_sharedLibPaths};
345+
opts.enableObjectDump = _useCache;
346+
347+
// lower to LLVM
348+
if (::mlir::failed(_pm.run(module)))
349+
throw std::runtime_error("failed to run pass manager");
350+
351+
if (_verbose)
352+
module.dump();
353+
354+
auto maybeEngine = ::mlir::ExecutionEngine::create(module, opts);
334355
assert(maybeEngine && "failed to construct an execution engine");
335356
auto &engine = maybeEngine.get();
336357

337358
llvm::SmallVector<void *> args;
359+
std::vector<intptr_t> out(osz);
360+
auto tmp = out.data();
338361
// first arg must be the result ptr
339-
args.push_back(&out);
362+
args.push_back(&tmp);
340363
// we need a void*& for every input tensor
341364
// we refer directly to the storage in inp
342365
for (auto &arg : inp) {
@@ -350,7 +373,7 @@ int JIT::run(::mlir::ModuleOp &module, const std::string &fname,
350373
throw std::runtime_error("JIT invocation failed");
351374
}
352375

353-
return 0;
376+
return out;
354377
}
355378

356379
static const char *pass_pipeline =
@@ -362,11 +385,13 @@ static const char *pass_pipeline =
362385
// "builtin.module(func.func(ptensor-dist),convert-dist-to-standard,convert-ptensor-to-linalg,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,scf-bufferize,linalg-bufferize,tensor-bufferize,bufferization-bufferize),func-bufferize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops),canonicalize,fold-memref-alias-ops,expand-strided-metadata,lower-affine,convert-scf-to-cf,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts)";
363386
: "func.func(ptensor-dist,dist-coalesce),convert-dist-to-standard,"
364387
"convert-ptensor-to-linalg,canonicalize,convert-shape-to-std,arith-"
365-
"expand,canonicalize,arith-bufferize,func-bufferize,func.func(empty-"
366-
"tensor-to-alloc-tensor,scf-bufferize,tensor-bufferize,linalg-"
388+
"expand,canonicalize,arith-bufferize,func-bufferize,func.func(tosa-"
389+
"to-linalg,"
390+
"empty-tensor-to-alloc-tensor,scf-bufferize,tensor-bufferize,linalg-"
367391
"bufferize,bufferization-bufferize,linalg-detensorize,tensor-"
368392
"bufferize,finalizing-bufferize,convert-linalg-to-parallel-loops),"
369-
"canonicalize,fold-memref-alias-ops,expand-strided-metadata,lower-"
393+
"canonicalize,fold-memref-alias-ops,expand-strided-metadata,convert-"
394+
"math-to-funcs,convert-math-to-libm,lower-"
370395
"affine,convert-scf-to-cf,convert-memref-to-llvm,convert-func-to-"
371396
"llvm,reconcile-unrealized-casts";
372397
JIT::JIT()
@@ -391,12 +416,27 @@ JIT::JIT()
391416
if (v == "1" || v == "y" || v == "Y" || v == "on" || v == "ON")
392417
_verbose = true;
393418
}
419+
_pm.enableTiming();
394420
// some verbosity
395421
if (_verbose) {
396422
_pm.enableStatistics();
397423
_pm.enableIRPrinting();
398424
_pm.dump();
399425
}
426+
427+
const char *envptr = getenv("DDPT_USE_CACHE");
428+
envptr = envptr ? envptr : "1";
429+
{
430+
auto c = std::string(envptr);
431+
_useCache = c == "1" || c == "y" || c == "Y" || c == "on" || c == "ON";
432+
std::cerr << "enableObjectDump=" << _useCache << std::endl;
433+
}
434+
435+
// const char * crunner = getenv("DDPT_CRUNNER_SO");
436+
// crunner = crunner ? crunner : "libmlir_c_runner_utils.so";
437+
envptr = getenv("DDPT_IDTR_SO");
438+
_sharedLibPaths = envptr ? envptr : "libidtr.so";
439+
// ::llvm::ArrayRef<::llvm::StringRef> shlibs = {crunner, envptr};
400440
}
401441

402442
// register dialects and passes
@@ -411,6 +451,10 @@ void init() {
411451
::mlir::registerConvertShapeToStandardPass();
412452
::mlir::tensor::registerTensorPasses();
413453
::mlir::registerLinalgPasses();
454+
::mlir::registerTosaToLinalg();
455+
::mlir::registerConvertMathToFuncs();
456+
::mlir::registerConvertMathToLibm();
457+
::mlir::tosa::registerTosaOptPasses();
414458
::mlir::func::registerFuncPasses();
415459
::mlir::registerConvertFuncToLLVMPass();
416460
::mlir::bufferization::registerBufferizationPasses();

test/stencil-2d.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def main():
119119

120120
# there is certainly a more Pythonic way to initialize W,
121121
# but it will have no impact on performance.
122+
t0 = timer()
122123
W = np.zeros(((2 * r + 1), (2 * r + 1)), dtype=np.float64)
123124
A = np.empty((n, n), dtype=np.float64)
124125
B = np.zeros((n, n), dtype=np.float64)
@@ -149,8 +150,8 @@ def main():
149150

150151
for k in range(iterations + 1):
151152
# start timer after a warmup iteration
153+
np.sync()
152154
if k <= 1:
153-
np.sync()
154155
t0 = timer()
155156

156157
if pattern == "star":

0 commit comments

Comments (0)