llvm · arpitj1 · Jun 6, 2024 · Jun 6, 2024 · Jun 6, 2024 · Jun 11, 2024
diff --git a/.gitignore b/.gitignore
@@ -85,3 +85,9 @@ pythonenv*
 # tmp output from tests
 *.exec1
 *.out1
+
+# Local-environment-specific scripts (carry SSH hostnames, IPs, usernames
+# for a particular dev machine + Jetson setup). Each developer has their
+# own version of these.
+scripts/correctness/run_jetson.sh
+scripts/correctness/logs/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,67 @@
+# Polygeist - Claude Instructions
+
+## Environment Setup
+
+Run the following in your shell before running any other commands (replace `/path/to/Polygeist` with the path to your checkout):
+```bash
+export POLYGEIST_ROOT=/path/to/Polygeist
+source "$POLYGEIST_ROOT/envsetup.sh"
+```
+This adds `build/bin/` to PATH, making `cgeist` and `polygeist-opt` available.
+
+## Build
+
+For the initial build, only `build_polygeist.sh` needs to be run; LLVM, MLIR, and Clang are already pre-built in `llvm-project/build`.
+
+To rebuild after making changes to any pass:
+```bash
+cd "$POLYGEIST_ROOT/build" && ninja
+```
+
+## Raising Pipeline (C → Linalg)
+
+```bash
+# Step 1: C to affine MLIR
+cgeist <file.c> --function=* --resource-dir=/usr/lib/clang/14 --raise-scf-to-affine -fPIC -S -g -c -o output.mlir
+
+# Step 2: Affine → Linalg (memref form)
+polygeist-opt --select-func="func-name=<funcname>" --remove-iter-args --affine-parallelize --raise-affine-to-linalg-pipeline <input.mlir> -o <output_linalg.mlir>
+
+# Step 3: Debufferize (memref linalg → tensor linalg)
+polygeist-opt --linalg-debufferize <input_linalg.mlir> -o <output_debufferized.mlir>
+
+# Step 4: Kernel extraction
+polygeist-opt <input_debufferized.mlir> --linalg-to-kernel="kernel-library-path=$POLYGEIST_ROOT/generic_solver/kernel_library.mlir"
+```
+
+## Key Source Files
+
+- `lib/polygeist/Passes/RaiseToLinalg.cpp` — raises `affine.for` loops to `linalg.generic`, creates `polygeist.submap` for strided accesses
+- `lib/polygeist/Passes/LinalgDebufferize.cpp` — converts memref-based linalg to tensor-based SSA form
+- `include/polygeist/PolygeistOps.td` — defines `polygeist.submap` and `polygeist.submapInverse`
+
+## NVIDIA gated-distribution SDKs — point, don't copy
+
+The directory `$PVASOL_ROOT` is the source tree for the PVA
+Solutions SDK. The PVA Solutions public `.deb` packages ship binaries only
+(`libpva_operator.so`, `libnvcv_types.so`, allowlist file) — *no headers*.
+Headers exist only inside the source tree, which NVIDIA distributes to
+approved developers through `developer.nvidia.com/embedded/pva`. The headers
+are therefore "behind a developer-program gate," not "secret internal-only";
+they're the same files any approved external developer would have.
+
+*Rule for using these headers in Polygeist:*
+
+- *Build-time include path is fine.* Add `-I$PVASOL_ROOT/public/src/operator/include`
+  (and the same pattern for NVCV / cuPVA / CV-CUDA headers under `public/3rdparty/`)
+  to the cross-compile flags in our build scripts.
+- *Never copy SDK headers or sources into the Polygeist tree.* No `cp` /
+  `git add` of any `.h` / `.hpp` / `.cpp` / `.c` file from `$PVASOL_ROOT`
+  into `$POLYGEIST_ROOT`. The Polygeist repo only ever references those
+  paths symbolically.
+- *Polygeist source code may `#include "OpConv2d.h"` etc.* — the include is
+  resolved through the `-I` flag at build time, just like cuDNN's `cudnn.h`.
+- *Anyone cloning Polygeist without PVA Solutions access gets a clean build
+  failure* — same as the cuDNN dependency on the cross-compile path today.
+- *Same policy applies* to any other gated-distribution NVIDIA SDK source
+  tree on this VM (cuPVA SDK, internal NVCV builds, etc.).
diff --git a/blas/dasum.c b/blas/dasum.c
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+// DASUM: Sum of absolute values
+// result = sum(|x[i * incx]|) for i = 0 .. N-1
+//
+// N:    number of elements to visit; if N <= 0 the loop body never runs
+//       and 0.0 is returned
+// x:    input vector, read-only; element i is read from x[i * incx]
+// incx: stride between visited elements (the index i * incx is computed
+//       in int)
+// Returns the sum, accumulated in double in increasing-i order.
+//
+// NOTE(review): incx is used as given. With incx == 0 this adds |x[0]| N
+// times, and with incx < 0 the indices i * incx are negative (before x).
+// Reference BLAS DASUM is believed to return 0 for incx <= 0 -- confirm
+// whether any caller relies on that.
+double dasum(int N, const double* x, int incx) {
+    double result = 0.0;
+
+    for (int i = 0; i < N; i++) {
+        result += fabs(x[i * incx]);
+    }
+
+    return result;
+}
+
+// Simple version (stride = 1)
+// Returns sum(|x[i]|) over the first N contiguous elements of x.
+// N <= 0 skips the loop and returns 0.0. x is only read.
+double simple_dasum(int N, const double* x) {
+    double result = 0.0;
+
+    for (int i = 0; i < N; i++) {
+        result += fabs(x[i]);
+    }
+
+    return result;
+}
+
+// Single precision version
+// Returns sum(|x[i * incx]|) for i = 0 .. N-1, using fabsf and a float
+// accumulator (so rounding differs from the double version).
+// N <= 0 skips the loop and returns 0.0f. x is only read.
+//
+// NOTE(review): incx is used as given; incx < 0 produces negative indices
+// (before x) and incx == 0 re-reads x[0] on every iteration.
+float sasum(int N, const float* x, int incx) {
+    float result = 0.0f;
+
+    for (int i = 0; i < N; i++) {
+        result += fabsf(x[i * incx]);
+    }
+
+    return result;
+}
+
+// Prints the first N elements of x to stdout on one line, in the form
+//   name: [x0, x1, ...]
+// Each element is formatted with one decimal place ("%.1f").
+// With N <= 0 the output is "name: []".
+void print_vector(const double* x, int N, const char* name) {
+    printf("%s: [", name);
+    for (int i = 0; i < N; i++) {
+        printf("%.1f", x[i]);
+        if (i < N - 1) printf(", "); // separator between elements, none after the last
+    }
+    printf("]\n");
+}
+
+// Demo driver: prints ASUM results for a fixed 6-element vector next to
+// the expected values so they can be compared by eye. There is no
+// automated pass/fail check; the function always returns 0.
+int main() {
+    const int N = 6;
+
+    double x[] = {1.0, -2.0, 3.0, -4.0, 5.0, -6.0};
+
+    printf("ASUM Test: sum of absolute values\n");
+    print_vector(x, N, "x");
+
+    // Contiguous case: all N elements.
+    double result = simple_dasum(N, x);
+
+    printf("\nasum(x) = %.1f\n", result);
+
+    // The expected value below is literal text, not recomputed from x, so
+    // it has to be kept in sync with the initializer by hand.
+    printf("\nManual verification:\n");
+    printf("|1.0| + |-2.0| + |3.0| + |-4.0| + |5.0| + |-6.0|\n");
+    printf("= 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0\n");
+    printf("= 21.0\n");
+
+    // Test with stride
+    // N = 3 with incx = 2 visits x[0], x[2], x[4], all inside the
+    // 6-element array.
+    printf("\n\nTesting with stride=2 (every other element):\n");
+    double result_stride = dasum(3, x, 2);
+    printf("asum(x[::2]) = %.1f\n", result_stride);
+    // Here the reference value is recomputed from the same three elements.
+    printf("Manual: |%.1f| + |%.1f| + |%.1f| = %.1f\n",
+           x[0], x[2], x[4], fabs(x[0]) + fabs(x[2]) + fabs(x[4]));
+
+    return 0;
+}
diff --git a/blas/daxpy.c b/blas/daxpy.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+// DAXPY: Constant times a vector plus a vector
+// y[i * incy] = alpha * x[i * incx] + y[i * incy] for i = 0 .. N-1
+//
+// N:     number of element pairs to process; N <= 0 leaves y untouched
+// alpha: scaling factor applied to each x element
+// x:     input vector, read-only, read at stride incx
+// incx:  stride between visited x elements
+// y:     in/out vector, updated in place at stride incy
+// incy:  stride between visited y elements
+//
+// NOTE(review): both strides are used as given and the indices are
+// computed in int. A negative stride yields negative indices (before the
+// pointer), and incy == 0 accumulates every term into y[0]. Reference BLAS
+// is believed to start from the far end of the vector for negative
+// increments -- confirm before passing incx/incy <= 0.
+void daxpy(int N, double alpha, const double* x, int incx, double* y, int incy) {
+    for (int i = 0; i < N; i++) {
+        y[i * incy] += alpha * x[i * incx];
+    }
+}
+
+// Simple version (stride = 1)
+// y[i] += alpha * x[i] over the first N contiguous elements.
+// x is only read; y is updated in place. N <= 0 leaves y untouched.
+void simple_daxpy(int N, double alpha, const double* x, double* y) {
+    for (int i = 0; i < N; i++) {
+        y[i] += alpha * x[i];
+    }
+}
+
+// Single precision version
+// y[i * incy] += alpha * x[i * incx] for i = 0 .. N-1, on float data.
+// x is only read; y is updated in place. N <= 0 leaves y untouched.
+//
+// NOTE(review): strides are used as given; negative values index before
+// the pointer and incy == 0 accumulates every term into y[0].
+void saxpy(int N, float alpha, const float* x, int incx, float* y, int incy) {
+    for (int i = 0; i < N; i++) {
+        y[i * incy] += alpha * x[i * incx];
+    }
+}
+
+// Prints the first N elements of x to stdout on one line, in the form
+//   name: [x0, x1, ...]
+// Each element is formatted with two decimal places ("%.2f").
+// With N <= 0 the output is "name: []".
+void print_vector(const double* x, int N, const char* name) {
+    printf("%s: [", name);
+    for (int i = 0; i < N; i++) {
+        printf("%.2f", x[i]);
+        if (i < N - 1) printf(", "); // separator between elements, none after the last
+    }
+    printf("]\n");
+}
+
+// Demo driver: runs the contiguous and strided AXPY variants on fixed
+// inputs and prints the results next to the expected values for comparison
+// by eye. There is no automated pass/fail check; the function always
+// returns 0.
+int main() {
+    const int N = 5;
+    const double alpha = 2.0;
+
+    double x[] = {1.0, 2.0, 3.0, 4.0, 5.0};
+    double y[] = {10.0, 20.0, 30.0, 40.0, 50.0};
+
+    printf("AXPY Test: y = alpha * x + y\n");
+    printf("alpha = %.2f\n", alpha);
+    print_vector(x, N, "x");
+    print_vector(y, N, "y (before)");
+
+    // Apply axpy
+    simple_daxpy(N, alpha, x, y);
+
+    print_vector(y, N, "y (after)");
+
+    // The expected values below are literal text, not recomputed, so they
+    // have to be kept in sync with alpha, x and y by hand.
+    printf("\nManual verification:\n");
+    printf("y[0] = 2.0*1.0 + 10.0 = 12.00\n");
+    printf("y[1] = 2.0*2.0 + 20.0 = 24.00\n");
+    printf("y[2] = 2.0*3.0 + 30.0 = 36.00\n");
+    printf("y[3] = 2.0*4.0 + 40.0 = 48.00\n");
+    printf("y[4] = 2.0*5.0 + 50.0 = 60.00\n");
+
+    // Test with stride
+    printf("\n\nTesting with stride=2:\n");
+    double x2[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    double y2[] = {100.0, 200.0, 300.0, 400.0, 500.0, 600.0};
+
+    // The "before" contents are printed as literal text rather than from
+    // x2 / y2.
+    printf("x: [1, 2, 3, 4, 5, 6]\n");
+    printf("y (before): [100, 200, 300, 400, 500, 600]\n");
+    printf("Computing: y[::2] += 10.0 * x[::2]\n");
+
+    // N = 3 with both strides = 2 touches indices 0, 2, 4 only; the odd
+    // positions of y2 must come out unchanged.
+    daxpy(3, 10.0, x2, 2, y2, 2); // y[0,2,4] += 10*x[0,2,4]
+
+    printf("y (after): [%.1f, %.1f, %.1f, %.1f, %.1f, %.1f]\n",
+           y2[0], y2[1], y2[2], y2[3], y2[4], y2[5]);
+    printf("Expected: [110.0, 200.0, 330.0, 400.0, 550.0, 600.0]\n");
+
+    return 0;
+}
diff --git a/blas/dcopy.c b/blas/dcopy.c
@@ -0,0 +1,76 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+// DCOPY: Copy vector x to vector y
+// y[i * incy] = x[i * incx] for i = 0 .. N-1
+//
+// N:    number of elements to copy; N <= 0 leaves y untouched
+// x:    source vector, read-only, read at stride incx
+// incx: stride between visited source elements
+// y:    destination vector, written at stride incy; elements of y that
+//       fall between the visited positions are not modified
+// incy: stride between visited destination elements
+//
+// NOTE(review): both strides are used as given and the indices are
+// computed in int. A negative stride yields negative indices (before the
+// pointer), and incy == 0 writes every element to y[0]. Reference BLAS is
+// believed to start from the far end of the vector for negative
+// increments -- confirm before passing incx/incy <= 0.
+void dcopy(int N, const double* x, int incx, double* y, int incy) {
+    for (int i = 0; i < N; i++) {
+        y[i * incy] = x[i * incx];
+    }
+}
+
+// Simple version (stride = 1)
+// Copies the first N contiguous elements of x into y, one element at a
+// time in increasing index order. x is only read. N <= 0 leaves y
+// untouched.
+void simple_dcopy(int N, const double* x, double* y) {
+    for (int i = 0; i < N; i++) {
+        y[i] = x[i];
+    }
+}
+
+// Single precision version
+// y[i * incy] = x[i * incx] for i = 0 .. N-1, on float data.
+// x is only read. N <= 0 leaves y untouched.
+//
+// NOTE(review): strides are used as given; negative values index before
+// the pointer and incy == 0 writes every element to y[0].
+void scopy(int N, const float* x, int incx, float* y, int incy) {
+    for (int i = 0; i < N; i++) {
+        y[i * incy] = x[i * incx];
+    }
+}
+
+// Prints the first N elements of x to stdout on one line, in the form
+//   name: [x0, x1, ...]
+// Each element is formatted with one decimal place ("%.1f").
+// With N <= 0 the output is "name: []".
+void print_vector(const double* x, int N, const char* name) {
+    printf("%s: [", name);
+    for (int i = 0; i < N; i++) {
+        printf("%.1f", x[i]);
+        if (i < N - 1) printf(", "); // separator between elements, none after the last
+    }
+    printf("]\n");
+}
+
+// Demo driver: exercises the contiguous copy (checked programmatically,
+// prints PASS/FAIL) and a strided copy (printed next to the expected
+// values for comparison by eye). Always returns 0, even when the
+// verification prints FAIL.
+int main() {
+    const int N = 5;
+
+    double x[] = {1.0, 2.0, 3.0, 4.0, 5.0};
+    double y[5] = {0.0, 0.0, 0.0, 0.0, 0.0};
+
+    printf("COPY Test\n");
+    print_vector(x, N, "x (source)");
+    print_vector(y, N, "y (before)");
+
+    // Copy x to y
+    simple_dcopy(N, x, y);
+
+    print_vector(y, N, "y (after)");
+
+    // Verify
+    // Exact floating-point equality is intended here: a copy must
+    // reproduce each value unchanged.
+    printf("\nVerification: ");
+    int correct = 1;
+    for (int i = 0; i < N; i++) {
+        if (x[i] != y[i]) {
+            correct = 0;
+            break;
+        }
+    }
+    printf("%s\n", correct ? "PASS" : "FAIL");
+
+    // Test with stride
+    printf("\n\nTesting with stride:\n");
+    double src[] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0};
+    double dst[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+
+    // The source contents are printed as literal text rather than from src.
+    printf("Source: [10, 20, 30, 40, 50, 60]\n");
+    printf("Copying every other element (incx=2) to every position (incy=1):\n");
+    // Only dst[0..2] are written; dst[3..5] keep their 0.0 initial value.
+    dcopy(3, src, 2, dst, 1); // Copy src[0,2,4] to dst[0,1,2]
+    printf("Result: [%.1f, %.1f, %.1f, %.1f, %.1f, %.1f]\n",
+           dst[0], dst[1], dst[2], dst[3], dst[4], dst[5]);
+    printf("Expected: [10.0, 30.0, 50.0, 0.0, 0.0, 0.0]\n");
+
+    return 0;
+}
diff --git a/blas/ddot.c b/blas/ddot.c
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+// DDOT: Compute dot product of two vectors
+// result = sum(x[i * incx] * y[i * incy]) for i = 0 .. N-1
+//
+// N:    number of element pairs to multiply; if N <= 0 the loop body
+//       never runs and 0.0 is returned
+// x:    first input vector, read-only, read at stride incx
+// incx: stride between visited x elements
+// y:    second input vector, read-only, read at stride incy
+// incy: stride between visited y elements
+// Returns the sum of products, accumulated in double in increasing-i
+// order.
+//
+// NOTE(review): both strides are used as given and the indices are
+// computed in int. A negative stride yields negative indices (before the
+// pointer). Reference BLAS is believed to start from the far end of the
+// vector for negative increments -- confirm before passing incx/incy < 0.
+double ddot(int N, const double* x, int incx, const double* y, int incy) {
+    double result = 0.0;
+
+    for (int i = 0; i < N; i++) {
+        result += x[i * incx] * y[i * incy];
+    }
+
+    return result;
+}
+
+// Simple version (stride = 1)
+// Returns sum(x[i] * y[i]) over the first N contiguous elements.
+// N <= 0 skips the loop and returns 0.0. Both inputs are only read.
+double simple_ddot(int N, const double* x, const double* y) {
+    double result = 0.0;
+
+    for (int i = 0; i < N; i++) {
+        result += x[i] * y[i];
+    }
+
+    return result;
+}
+
+// Single precision version
+// Returns sum(x[i * incx] * y[i * incy]) for i = 0 .. N-1, using a float
+// accumulator (so rounding differs from the double version).
+// N <= 0 skips the loop and returns 0.0f. Both inputs are only read.
+//
+// NOTE(review): strides are used as given; negative values index before
+// the pointer.
+float sdot(int N, const float* x, int incx, const float* y, int incy) {
+    float result = 0.0f;
+
+    for (int i = 0; i < N; i++) {
+        result += x[i * incx] * y[i * incy];
+    }
+
+    return result;
+}
+
+// Demo driver: prints the contiguous and strided dot products of two fixed
+// 5-element vectors next to independently recomputed reference values for
+// comparison by eye. There is no automated pass/fail check; the function
+// always returns 0.
+int main() {
+    const int N = 5;
+    double x[] = {1.0, 2.0, 3.0, 4.0, 5.0};
+    double y[] = {2.0, 3.0, 4.0, 5.0, 6.0};
+
+    printf("DOT Product Test\n");
+    // Elements are printed space-separated, each followed by a space
+    // (including the last one, before the closing bracket).
+    printf("x: [");
+    for (int i = 0; i < N; i++) {
+        printf("%.1f ", x[i]);
+    }
+    printf("]\n");
+
+    printf("y: [");
+    for (int i = 0; i < N; i++) {
+        printf("%.1f ", y[i]);
+    }
+    printf("]\n\n");
+
+    // Test simple version
+    double result = simple_ddot(N, x, y);
+    printf("dot(x, y) = %.1f\n", result);
+
+    // Manual verification
+    // Recomputes the sum inline while printing each product.
+    double manual = 0.0;
+    for (int i = 0; i < N; i++) {
+        manual += x[i] * y[i];
+        printf("  %.1f * %.1f = %.1f\n", x[i], y[i], x[i] * y[i]);
+    }
+    printf("Expected: %.1f, Actual: %.1f\n\n", manual, result);
+
+    // Test with stride
+    // N = 3 with both strides = 2 visits indices 0, 2, 4; index 4 is the
+    // last valid element of the 5-element arrays.
+    printf("Testing with stride=2 (every other element):\n");
+    double result_stride = ddot(3, x, 2, y, 2);
+    printf("dot(x[::2], y[::2]) = %.1f\n", result_stride);
+    printf("Manual: %.1f*%.1f + %.1f*%.1f + %.1f*%.1f = %.1f\n",
+           x[0], y[0], x[2], y[2], x[4], y[4],
+           x[0]*y[0] + x[2]*y[2] + x[4]*y[4]);
+
+    return 0;
+}