Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,26 @@ wasm-clean: ## Remove WASM build artefacts (pkg/)
$(call warn,Removing $(WASM_CRATE)/pkg/ ...)
@rm -rf $(WASM_CRATE)/pkg/

# ══════════════════════════════════════════════════════════════════════════════
## WASI / RISC-V Integration Tests (Docker-based)
# ══════════════════════════════════════════════════════════════════════════════

WASI_SCRIPT := tests/wasm-runtimes/wasm-test.sh

wasi-build: ## Build all WASM runtime + RISC-V Docker test images
$(call log,Building WASM/RISC-V integration test images…)
@$(WASI_SCRIPT) build all

wasi-test: ## Run WASM/RISC-V integration tests across all runtimes
$(call log,Running WASM/RISC-V integration tests…)
@$(WASI_SCRIPT) test all

wasi-status: ## Show Docker image / container status for WASI tests
@$(WASI_SCRIPT) status

wasi-clean: ## Remove all WASI test Docker images and artefacts
@$(WASI_SCRIPT) clean

# ══════════════════════════════════════════════════════════════════════════════
## Clean
# ══════════════════════════════════════════════════════════════════════════════
Expand Down
8 changes: 6 additions & 2 deletions crates/edgeparse-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@ documentation = "https://docs.rs/edgeparse-cli"
name = "edgeparse"
path = "src/main.rs"

[features]
default = ["native"]
native = ["rayon", "edgeparse-core/native"]

[dependencies]
edgeparse-core = { path = "../edgeparse-core", version = "0.2.0" }
edgeparse-core = { path = "../edgeparse-core", version = "0.2.0", default-features = false }
clap = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
anyhow = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }
rayon = { workspace = true }
rayon = { workspace = true, optional = true }
41 changes: 37 additions & 4 deletions crates/edgeparse-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::sync::atomic::{AtomicBool, Ordering};

use clap::Parser;
use edgeparse_core::api::config::OutputFormat;
#[cfg(feature = "native")]
use rayon::prelude::*;

/// EdgeParse: High-performance PDF-to-structured-data extraction
Expand Down Expand Up @@ -124,10 +125,12 @@ fn main() {
// Build processing config
let config = build_config(&cli);

// Process each input file in parallel
// Process each input file (parallel when native feature is enabled)
let has_errors = AtomicBool::new(false);
cli.input.par_iter().for_each(|input_path| {
match edgeparse_core::convert(input_path, &config) {

let process_file = |input_path: &PathBuf| {
let result = convert_file(input_path, &config);
match result {
Ok(doc) => {
log::info!(
"Processed {} ({} pages)",
Expand All @@ -144,7 +147,13 @@ fn main() {
has_errors.store(true, Ordering::Relaxed);
}
}
});
};

#[cfg(feature = "native")]
cli.input.par_iter().for_each(process_file);

#[cfg(not(feature = "native"))]
cli.input.iter().for_each(process_file);

if has_errors.load(Ordering::Relaxed) {
process::exit(1);
Expand Down Expand Up @@ -202,6 +211,30 @@ fn write_outputs(
Ok(())
}

/// Run the PDF conversion for a single input path, picking the backend at
/// compile time.
///
/// * With the `native` feature, the path is handed straight to
///   `edgeparse_core::convert()` (the path-based API, which can use external
///   tools for raster table OCR).
/// * Without it (e.g. WASI builds), the whole file is read into memory and
///   passed to `edgeparse_core::convert_bytes()` along with the file name;
///   no external tool support is available on this path.
///
/// # Errors
///
/// Returns whatever error the selected backend reports. On non-native builds
/// a failure to read the file is propagated via `?` as well.
fn convert_file(
    input_path: &std::path::Path,
    config: &edgeparse_core::api::config::ProcessingConfig,
) -> Result<edgeparse_core::models::document::PdfDocument, edgeparse_core::EdgePdfError> {
    #[cfg(feature = "native")]
    {
        edgeparse_core::convert(input_path, config)
    }
    #[cfg(not(feature = "native"))]
    {
        let bytes = std::fs::read(input_path)?;
        // Fall back to a placeholder when the path has no final component
        // or that component is not valid UTF-8.
        let display_name = match input_path.file_name().and_then(|os| os.to_str()) {
            Some(name) => name,
            None => "unknown.pdf",
        };
        edgeparse_core::convert_bytes(&bytes, display_name, config)
    }
}

fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
use edgeparse_core::api::config::*;
use edgeparse_core::api::filter::FilterConfig;
Expand Down
2 changes: 2 additions & 0 deletions tests/wasm-runtimes/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Build artifacts — extracted binaries from Docker builds
.build/
68 changes: 68 additions & 0 deletions tests/wasm-runtimes/Dockerfile.build.riscv
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# ─── Reproducible RISC-V Cross-Compilation ──────────────────────────────────
# Greg's AI coding buddy:
# Cross-compiles edgeparse for riscv64gc-unknown-linux-gnu.
# Produces TWO binaries:
# 1. Dynamic-linked → runs under QEMU user-mode (with sysroot)
# 2. Static-linked → runs on Spike/libriscv/RVVM/CKB-VM (no sysroot)
#
# riscv64gc = RV64IMAFDC — the "general-purpose computing" profile
# that Linux distros target (Debian, Ubuntu, Fedora all ship riscv64gc).
#
# Usage:
# docker build -f tests/wasm-runtimes/Dockerfile.build.riscv \
# -t edgeparse-riscv-build .
# ─────────────────────────────────────────────────────────────────────────────

FROM rust:1-slim-bookworm AS builder

# Cross-compilation toolchain for RISC-V 64-bit
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc-riscv64-linux-gnu \
libc6-dev-riscv64-cross \
pkg-config \
&& rm -rf /var/lib/apt/lists/*

RUN rustup target add riscv64gc-unknown-linux-gnu

# Tell cargo which linker to use for the RISC-V target
ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc

WORKDIR /build

# ── Cache cargo registry: copy manifests first ──────────────────────────────
COPY Cargo.toml Cargo.lock ./
COPY crates/edgeparse-core/Cargo.toml crates/edgeparse-core/
COPY crates/edgeparse-cli/Cargo.toml crates/edgeparse-cli/
COPY crates/pdf-cos/Cargo.toml crates/pdf-cos/

# Strip workspace members that need native platform SDKs
RUN sed -E -i \
'/"crates\/(edgeparse-python|edgeparse-node|edgeparse-wasm)"/d' \
Cargo.toml && \
mkdir -p crates/edgeparse-core/src crates/edgeparse-cli/src crates/pdf-cos/src && \
echo "fn main() {}" > crates/edgeparse-cli/src/main.rs && \
touch crates/edgeparse-core/src/lib.rs && \
touch crates/pdf-cos/src/lib.rs && \
cargo build --target riscv64gc-unknown-linux-gnu --release \
-p edgeparse-cli 2>/dev/null || true

# ── Copy real source and build (dynamic) ────────────────────────────────────
COPY crates/ crates/
RUN find crates/ -name '*.rs' -exec touch {} + && \
cargo build --target riscv64gc-unknown-linux-gnu --release \
-p edgeparse-cli

# Save dynamic binary before static build overwrites it
RUN cp target/riscv64gc-unknown-linux-gnu/release/edgeparse /tmp/edgeparse-dynamic

# ── Build statically-linked variant for Spike/libriscv/RVVM/CKB-VM ─────────
# These VMs have minimal/no dynamic linker — static linking is essential.
RUN RUSTFLAGS="-C target-feature=+crt-static" \
cargo build --target riscv64gc-unknown-linux-gnu --release \
-p edgeparse-cli && \
cp target/riscv64gc-unknown-linux-gnu/release/edgeparse /tmp/edgeparse-static

# ── Output stage ────────────────────────────────────────────────────────────
FROM scratch AS output
COPY --from=builder /tmp/edgeparse-dynamic /out/edgeparse
COPY --from=builder /tmp/edgeparse-static /out/edgeparse-static
47 changes: 47 additions & 0 deletions tests/wasm-runtimes/Dockerfile.build.wasix
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# ─── Reproducible WASIX Build ───────────────────────────────────────────────
# Greg's AI coding buddy:
# Builds edgeparse as a WASIX binary (.wasm) using cargo-wasix.
# WASIX = WASI + POSIX superpowers (threads, sockets, fork/exec).
# Only runs on Wasmer — the one runtime to rule them all (for WASIX).
#
# The target triple is wasm32-wasmer-wasi — a custom Wasmer target
# that extends wasm32-wasi with the full POSIX syscall surface.
#
# Usage:
# docker build -f tests/wasm-runtimes/Dockerfile.build.wasix \
# -t edgeparse-wasix-build .
# ─────────────────────────────────────────────────────────────────────────────

FROM rust:1-slim-bookworm AS builder

# cargo-wasix installs its own rustup toolchain + wasm32-wasmer-wasi target
RUN cargo install cargo-wasix
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cargo install cargo-wasix is unpinned, so the WASIX build can change/break over time depending on the latest published version. Consider pinning a specific cargo-wasix version (or installing from a known git ref) to keep the build reproducible.

Suggested change
RUN cargo install cargo-wasix
RUN cargo install cargo-wasix --version 0.16.3

Copilot uses AI. Check for mistakes.

WORKDIR /build

# ── Cache cargo registry: copy manifests first ──────────────────────────────
COPY Cargo.toml Cargo.lock ./
COPY crates/edgeparse-core/Cargo.toml crates/edgeparse-core/
COPY crates/edgeparse-cli/Cargo.toml crates/edgeparse-cli/
COPY crates/pdf-cos/Cargo.toml crates/pdf-cos/

# Strip workspace members that don't compile for WASIX
RUN sed -E -i \
'/"crates\/(edgeparse-python|edgeparse-node|edgeparse-wasm)"/d' \
Cargo.toml && \
mkdir -p crates/edgeparse-core/src crates/edgeparse-cli/src crates/pdf-cos/src && \
echo "fn main() {}" > crates/edgeparse-cli/src/main.rs && \
touch crates/edgeparse-core/src/lib.rs && \
touch crates/pdf-cos/src/lib.rs && \
cargo wasix build --release -p edgeparse-cli --no-default-features 2>/dev/null || true

# ── Copy real source and build ──────────────────────────────────────────────
COPY crates/ crates/
RUN find crates/ -name '*.rs' -exec touch {} + && \
cargo wasix build --release -p edgeparse-cli --no-default-features

# ── Output stage ────────────────────────────────────────────────────────────
FROM scratch AS output
COPY --from=builder \
/build/target/wasm32-wasmer-wasi/release/edgeparse.wasm \
/out/edgeparse-wasix.wasm
54 changes: 54 additions & 0 deletions tests/wasm-runtimes/Dockerfile.build.wasm
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# ─── Reproducible WASM (WASI) Build ─────────────────────────────────────────
# Greg's AI coding buddy:
# Builds edgeparse as a WASI Preview 1 binary (.wasm) that runs on
# any conformant runtime (wasmtime, wasmer, wasmedge, wamr).
#
# The wasm32-wasip1 target compiles edgeparse-core without rayon/image/zip
# (those need native threads or JS glue). Pure PDF parsing still works —
# you just lose parallelism and image extraction.
#
# Usage:
# docker build -f tests/wasm-runtimes/Dockerfile.build.wasm \
# -t edgeparse-wasi-build .
# # Extract the binary:
# id=$(docker create edgeparse-wasi-build) && \
# docker cp "$id":/out/edgeparse.wasm tests/wasm-runtimes/.build/ && \
# docker rm "$id"
# ─────────────────────────────────────────────────────────────────────────────

FROM rust:1-slim-bookworm AS builder

RUN rustup target add wasm32-wasip1

WORKDIR /build

# ── Cache cargo registry: copy manifests first ──────────────────────────────
COPY Cargo.toml Cargo.lock ./
COPY crates/edgeparse-core/Cargo.toml crates/edgeparse-core/
COPY crates/edgeparse-cli/Cargo.toml crates/edgeparse-cli/
COPY crates/pdf-cos/Cargo.toml crates/pdf-cos/

# Strip workspace members that don't compile for wasm32-wasip1
# (pyo3, napi-rs, wasm-bindgen are browser-only / native-only)
RUN sed -E -i \
'/"crates\/(edgeparse-python|edgeparse-node|edgeparse-wasm)"/d' \
Cargo.toml && \
mkdir -p crates/edgeparse-core/src crates/edgeparse-cli/src crates/pdf-cos/src && \
echo "fn main() {}" > crates/edgeparse-cli/src/main.rs && \
touch crates/edgeparse-core/src/lib.rs && \
touch crates/pdf-cos/src/lib.rs && \
cargo build --target wasm32-wasip1 --release \
-p edgeparse-cli --no-default-features 2>/dev/null || true
# ^^ dummy build warms the dep cache; may fail on empty libs — that's fine

# ── Copy real source and build ──────────────────────────────────────────────
COPY crates/ crates/
RUN find crates/ -name '*.rs' -exec touch {} + && \
cargo build --target wasm32-wasip1 --release \
-p edgeparse-cli --no-default-features

# ── Output stage — just the binary ──────────────────────────────────────────
FROM scratch AS output
COPY --from=builder \
/build/target/wasm32-wasip1/release/edgeparse.wasm \
/out/edgeparse.wasm
31 changes: 31 additions & 0 deletions tests/wasm-runtimes/Dockerfile.runner.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# ─── Shared Runner Base ─────────────────────────────────────────────────────
# Greg's AI coding buddy:
# Common base layer for all WASM runtime test containers.
# Build this FIRST — subsequent runtime images inherit its cached layers.
#
# docker build -f tests/wasm-runtimes/Dockerfile.runner.base \
# -t edgeparse-wasi-base .
# ─────────────────────────────────────────────────────────────────────────────

FROM ubuntu:24.04

# Shared dependencies — curl for runtime installers, ca-certs for HTTPS,
# xz-utils for compressed release tarballs, file for MIME sniffing
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
ca-certificates \
xz-utils \
file \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /test

# Copy test fixtures and scripts
COPY tests/fixtures/sample.pdf /test/fixtures/
COPY tests/wasm-runtimes/run-tests.sh /test/
RUN chmod +x /test/run-tests.sh

# The .wasm binary is expected at /test/edgeparse.wasm
# It gets copied in by each runtime Dockerfile or mounted at runtime.

ENTRYPOINT ["/test/run-tests.sh"]
46 changes: 46 additions & 0 deletions tests/wasm-runtimes/Dockerfile.runner.ckb-vm
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# ─── CKB-VM Runner ──────────────────────────────────────────────────────────
# Greg's AI coding buddy:
# CKB-VM — Production blockchain VM from Nervos Network.
# rv64imc ISA, W^X memory protection, gas metering, JIT compilation.
# 2.5x faster than Wasmer Singlepass. Deployed on CKB mainnet.
#
# Uses ckb-debugger to execute RISC-V ELF binaries with
# Linux syscall emulation. Limited POSIX support — designed for
# deterministic computation, not general-purpose Linux apps.
#
# EXPERIMENTAL: CKB-VM has limited syscall support. edgeparse's
# file I/O may not be fully supported. Tests may partially fail.
#
# docker build -f tests/wasm-runtimes/Dockerfile.runner.ckb-vm \
# -t edgeparse-riscv-ckb-vm .
# ─────────────────────────────────────────────────────────────────────────────

FROM rust:1-slim-bookworm AS builder

# Build ckb-standalone-debugger which can run RISC-V ELF binaries
RUN cargo install ckb-debugger 2>/dev/null || \
(apt-get update && apt-get install -y --no-install-recommends git pkg-config libssl-dev && \
cargo install --git https://github.com/nervosnetwork/ckb-standalone-debugger ckb-debugger)

FROM ubuntu:24.04

RUN apt-get update && apt-get install -y --no-install-recommends \
file \
&& rm -rf /var/lib/apt/lists/*

COPY --from=builder /usr/local/cargo/bin/ckb-debugger /usr/local/bin/

# Verify
RUN ckb-debugger --version 2>&1 || echo "CKB debugger installed"

WORKDIR /test

COPY tests/fixtures/sample.pdf /test/fixtures/
COPY tests/wasm-runtimes/run-tests.sh /test/
RUN chmod +x /test/run-tests.sh

# Copy pre-built RISC-V binary (statically linked)
COPY tests/wasm-runtimes/.build/edgeparse-riscv64-static /test/edgeparse-riscv64

ENTRYPOINT ["/test/run-tests.sh"]
CMD ["ckb-vm"]
Loading