Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions recipes/recipes_emscripten/sentencepiece/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
# Builds the SentencePiece static library with Emscripten and installs the
# library, its public headers and hand-written CMake package files into PREFIX.
set -exuo pipefail

# `set -u` aborts on any unset variable, so every optional input gets an
# empty default before it is expanded.
EM_FORGE_SIDE_MODULE_CFLAGS="${EM_FORGE_SIDE_MODULE_CFLAGS:-}"
EM_FORGE_SIDE_MODULE_LDFLAGS="${EM_FORGE_SIDE_MODULE_LDFLAGS:-}"
CFLAGS="${CFLAGS:-}"
CXXFLAGS="${CXXFLAGS:-}"
LDFLAGS="${LDFLAGS:-}"
CMAKE_ARGS="${CMAKE_ARGS:-}"

export CFLAGS="$CFLAGS $EM_FORGE_SIDE_MODULE_CFLAGS"
export CXXFLAGS="$CXXFLAGS $EM_FORGE_SIDE_MODULE_CFLAGS"
export LDFLAGS="$LDFLAGS $EM_FORGE_SIDE_MODULE_LDFLAGS"

mkdir -p build
cd build

# CMAKE_ARGS is intentionally unquoted: it may carry several
# whitespace-separated options that must be passed as separate arguments.
# shellcheck disable=SC2086
emcmake cmake -GNinja \
    ${CMAKE_ARGS} \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_CXX_STANDARD=17 \
    -DCMAKE_INSTALL_LIBDIR=lib \
    -DCMAKE_INSTALL_PREFIX="${PREFIX}" \
    -DCMAKE_PREFIX_PATH="${PREFIX}" \
    -DSPM_ENABLE_SHARED=OFF \
    -DSPM_BUILD_TEST=OFF \
    -DSPM_ENABLE_TCMALLOC=OFF \
    -DSPM_ABSL_PROVIDER=package \
    -DSPM_PROTOBUF_PROVIDER=internal \
    -DSPM_ENABLE_NFKC_COMPILE=OFF \
    ..

# Build only the static library (CLI tools fail to link due to duplicate
# abseil symbols between internal protobuf-lite and system libabseil)
ninja sentencepiece-static

# Install library and headers manually, because only one target was built and
# the project's own install step is therefore not used.
mkdir -p "${PREFIX}/lib" "${PREFIX}/include"
cp src/libsentencepiece.a "${PREFIX}/lib/"
cp ../src/sentencepiece_processor.h "${PREFIX}/include/"
cp ../src/sentencepiece_trainer.h "${PREFIX}/include/"

# Remove .la files if any (best effort: errors are deliberately ignored)
find "${PREFIX}" -name '*.la' -delete 2>/dev/null || true

# Install CMake config files
cmake_pkg_dir="${PREFIX}/lib/cmake/sentencepiece"
mkdir -p "${cmake_pkg_dir}"

# Substitute the package version while copying; writing through a redirect
# avoids `sed -i`, whose syntax differs between GNU and BSD sed.
sed "s/@PKG_VERSION@/${PKG_VERSION}/" \
    "${RECIPE_DIR}/sentencepieceConfigVersion.cmake" \
    > "${cmake_pkg_dir}/sentencepieceConfigVersion.cmake"
cp "${RECIPE_DIR}/sentencepieceConfig.cmake" "${cmake_pkg_dir}/"
cp "${RECIPE_DIR}/sentencepieceTargets.cmake" "${cmake_pkg_dir}/"
16 changes: 16 additions & 0 deletions recipes/recipes_emscripten/sentencepiece/build_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Configures, builds and runs the C++ smoke test in tests/ against the
# SentencePiece package installed in PREFIX.
set -euo pipefail

# Export the compiler flags for CMake; default to empty so `set -u` does not
# abort when they are not provided.
export CFLAGS="${CFLAGS:-}"
export CXXFLAGS="${CXXFLAGS:-}"

# Use the Ninja generator: the recipe's test requirements declare cmake and
# ninja, but not make.
emcmake cmake -GNinja -S tests -B build_tests \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_PREFIX_PATH="${PREFIX}" \
    -DCMAKE_FIND_ROOT_PATH="${PREFIX}" \
    -DCMAKE_CXX_STANDARD=17

# Generator-agnostic build; falls back to a single job when CPU_COUNT is unset.
cmake --build build_tests --parallel "${CPU_COUNT:-1}"

echo "Running test..."
node build_tests/test_sentencepiece_cpp.js
67 changes: 67 additions & 0 deletions recipes/recipes_emscripten/sentencepiece/recipe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Recipe for the SentencePiece static library built with Emscripten.
context:
  name: sentencepiece
  version: 0.2.1

package:
  name: ${{ name }}
  version: ${{ version }}

source:
  url: https://github.com/google/sentencepiece/archive/refs/tags/v${{ version }}.tar.gz
  sha256: c1a59e9259c9653ad0ade653dadff074cd31f0a6ff2a11316f67bee4189a8f1b

build:
  number: 0
  script: build.sh

requirements:
  build:
    - ${{ compiler('c') }}
    - ${{ compiler('cxx') }}
    - cmake
    - ninja
  host:
    - libabseil
  run:
    # The shipped CMake package links consumers against absl:: targets.
    - libabseil

tests:
  # Verify every file that build.sh installs by hand.
  - package_contents:
      lib:
        - libsentencepiece.a
      include:
        - sentencepiece_processor.h
        - sentencepiece_trainer.h
      files:
        - lib/cmake/sentencepiece/sentencepieceConfig.cmake
        - lib/cmake/sentencepiece/sentencepieceConfigVersion.cmake
        # Included unconditionally by sentencepieceConfig.cmake, so a missing
        # file would break find_package(sentencepiece).
        - lib/cmake/sentencepiece/sentencepieceTargets.cmake
  # Build and run a small C++ program against the installed package.
  - script:
      - build_tests.sh
    requirements:
      build:
        - ${{ compiler('cxx') }}
        - cmake
        - ninja
    files:
      recipe:
        - build_tests.sh
        - tests/

about:
  homepage: https://github.com/google/sentencepiece
  license: Apache-2.0
  license_family: Apache
  license_file: LICENSE
  summary: Unsupervised text tokenizer for Neural Network-based text generation
  description: |
    SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
    Neural Network-based text generation systems where the vocabulary size is
    predetermined prior to the neural model training. It implements subword
    units (BPE and unigram language model) with the extension of direct training
    from raw sentences.
  documentation: https://github.com/google/sentencepiece
  repository: https://github.com/google/sentencepiece

extra:
  recipe-maintainers:
    - Alex-PLACET
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Package configuration file loaded by find_package(sentencepiece).
include(CMakeFindDependencyMacro)

# The imported sentencepiece target links against absl:: targets, so absl must
# be located first. find_dependency() already forwards the REQUIRED/QUIET
# arguments of the calling find_package(); hard-coding REQUIRED here would turn
# an optional lookup of sentencepiece into a fatal error when absl is missing.
find_dependency(absl)

include("${CMAKE_CURRENT_LIST_DIR}/sentencepieceTargets.cmake")
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Version file consulted by find_package(sentencepiece <version>).
# The @PKG_VERSION@ placeholder is substituted by the recipe's build script.
set(PACKAGE_VERSION "@PKG_VERSION@")

# Start from "incompatible"; the result is upgraded only when the installed
# version is not older than the requested one and shares its major component.
set(PACKAGE_VERSION_COMPATIBLE FALSE)

if(NOT PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
  # Extract the major component; fall back to the whole string when the
  # version contains no dot.
  if(PACKAGE_VERSION MATCHES "^([0-9]+)\\.")
    set(CVF_VERSION_MAJOR "${CMAKE_MATCH_1}")
  else()
    set(CVF_VERSION_MAJOR "${PACKAGE_VERSION}")
  endif()

  if(PACKAGE_FIND_VERSION_MAJOR STREQUAL CVF_VERSION_MAJOR)
    set(PACKAGE_VERSION_COMPATIBLE TRUE)
  endif()

  # An identical version string additionally counts as an exact match.
  if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
    set(PACKAGE_VERSION_EXACT TRUE)
  endif()
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Defines the imported target sentencepiece::sentencepiece for the static
# library. All paths are relative to this file's directory.

# Nothing to do when the target was already defined by an earlier inclusion.
if(TARGET sentencepiece::sentencepiece)
  return()
endif()

add_library(sentencepiece::sentencepiece STATIC IMPORTED)

# The static archive, two directories above this file.
set_property(TARGET sentencepiece::sentencepiece PROPERTY
  IMPORTED_LOCATION "${CMAKE_CURRENT_LIST_DIR}/../../libsentencepiece.a"
)

# The public headers, three directories above this file.
set_property(TARGET sentencepiece::sentencepiece PROPERTY
  INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_LIST_DIR}/../../../include"
)

# A static archive carries no link dependencies of its own, so the abseil
# targets consumers must link are listed explicitly.
set_property(TARGET sentencepiece::sentencepiece PROPERTY
  INTERFACE_LINK_LIBRARIES
    absl::base
    absl::flags
    absl::flags_parse
    absl::flat_hash_map
    absl::flat_hash_set
    absl::log
    absl::random_random
    absl::status
    absl::statusor
    absl::str_format
    absl::strings
)
10 changes: 10 additions & 0 deletions recipes/recipes_emscripten/sentencepiece/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Smoke-test project: builds one executable against the installed
# sentencepiece CMake package.
cmake_minimum_required(VERSION 3.16)
project(test_sentencepiece_cpp LANGUAGES CXX)

# Require C++17 outright; without *_REQUIRED, CMake silently falls back to an
# older standard when the compiler does not support the requested one.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Only a config-file package is shipped (sentencepieceConfig.cmake), so skip
# the Find-module search.
find_package(sentencepiece CONFIG REQUIRED)

add_executable(test_sentencepiece_cpp test_sentencepiece.cpp)
target_link_libraries(test_sentencepiece_cpp
  PRIVATE
    sentencepiece::sentencepiece
)
116 changes: 116 additions & 0 deletions recipes/recipes_emscripten/sentencepiece/tests/test_sentencepiece.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include <sentencepiece_processor.h>
#include <cassert>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Auto-generated tiny SentencePiece BPE model for testing.
// These bytes are the serialized model that main() hands to
// SentencePieceProcessor::LoadFromSerializedProto(); do not edit them by hand.
// Model size: 648 bytes
static const unsigned char kTestModelData[] = {
0x0a, 0x0e, 0x0a, 0x05, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x15, 0x00, 0x00,
0x00, 0x00, 0x18, 0x02, 0x0a, 0x0c, 0x0a, 0x03, 0x3c, 0x73, 0x3e, 0x15,
0x00, 0x00, 0x00, 0x00, 0x18, 0x03, 0x0a, 0x0d, 0x0a, 0x04, 0x3c, 0x2f,
0x73, 0x3e, 0x15, 0x00, 0x00, 0x00, 0x00, 0x18, 0x03, 0x0a, 0x0b, 0x0a,
0x04, 0xe2, 0x96, 0x81, 0x62, 0x15, 0x00, 0x00, 0x00, 0x80, 0x0a, 0x0b,
0x0a, 0x04, 0xe2, 0x96, 0x81, 0x67, 0x15, 0x00, 0x00, 0x80, 0xbf, 0x0a,
0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x6d, 0x15, 0x00, 0x00, 0x00, 0xc0,
0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x73, 0x15, 0x00, 0x00, 0x40,
0xc0, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x77, 0x15, 0x00, 0x00,
0x80, 0xc0, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x79, 0x15, 0x00,
0x00, 0xa0, 0xc0, 0x0a, 0x09, 0x0a, 0x02, 0x61, 0x62, 0x15, 0x00, 0x00,
0xc0, 0xc0, 0x0a, 0x09, 0x0a, 0x02, 0x65, 0x66, 0x15, 0x00, 0x00, 0xe0,
0xc0, 0x0a, 0x09, 0x0a, 0x02, 0x6b, 0x6c, 0x15, 0x00, 0x00, 0x00, 0xc1,
0x0a, 0x09, 0x0a, 0x02, 0x71, 0x72, 0x15, 0x00, 0x00, 0x10, 0xc1, 0x0a,
0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x63, 0x15, 0x00, 0x00, 0x20, 0xc1,
0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x64, 0x15, 0x00, 0x00, 0x30,
0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x66, 0x15, 0x00, 0x00,
0x40, 0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x69, 0x15, 0x00,
0x00, 0x50, 0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x6a, 0x15,
0x00, 0x00, 0x60, 0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81, 0x6f,
0x15, 0x00, 0x00, 0x70, 0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96, 0x81,
0x70, 0x15, 0x00, 0x00, 0x80, 0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2, 0x96,
0x81, 0x75, 0x15, 0x00, 0x00, 0x88, 0xc1, 0x0a, 0x0b, 0x0a, 0x04, 0xe2,
0x96, 0x81, 0x76, 0x15, 0x00, 0x00, 0x90, 0xc1, 0x0a, 0x0c, 0x0a, 0x05,
0xe2, 0x96, 0x81, 0x62, 0x61, 0x15, 0x00, 0x00, 0x98, 0xc1, 0x0a, 0x0a,
0x0a, 0x03, 0xe2, 0x96, 0x81, 0x15, 0x00, 0x00, 0xa0, 0xc1, 0x0a, 0x08,
0x0a, 0x01, 0x6f, 0x15, 0x00, 0x00, 0xa8, 0xc1, 0x0a, 0x08, 0x0a, 0x01,
0x6c, 0x15, 0x00, 0x00, 0xb0, 0xc1, 0x0a, 0x08, 0x0a, 0x01, 0x61, 0x15,
0x00, 0x00, 0xb8, 0xc1, 0x0a, 0x08, 0x0a, 0x01, 0x62, 0x15, 0x00, 0x00,
0xc0, 0xc1, 0x0a, 0x08, 0x0a, 0x01, 0x72, 0x15, 0x00, 0x00, 0xc8, 0xc1,
0x0a, 0x08, 0x0a, 0x01, 0x64, 0x15, 0x00, 0x00, 0xd0, 0xc1, 0x0a, 0x08,
0x0a, 0x01, 0x65, 0x15, 0x00, 0x00, 0xd8, 0xc1, 0x0a, 0x08, 0x0a, 0x01,
0x66, 0x15, 0x00, 0x00, 0xe0, 0xc1, 0x0a, 0x08, 0x0a, 0x01, 0x68, 0x15,
0x00, 0x00, 0xe8, 0xc1, 0x0a, 0x08, 0x0a, 0x01, 0x77, 0x15, 0x00, 0x00,
0xf0, 0xc1, 0x0a, 0x08, 0x0a, 0x01, 0x7a, 0x15, 0x00, 0x00, 0xf8, 0xc1,
0x0a, 0x08, 0x0a, 0x01, 0x63, 0x15, 0x00, 0x00, 0x00, 0xc2, 0x0a, 0x08,
0x0a, 0x01, 0x67, 0x15, 0x00, 0x00, 0x04, 0xc2, 0x0a, 0x08, 0x0a, 0x01,
0x69, 0x15, 0x00, 0x00, 0x08, 0xc2, 0x0a, 0x08, 0x0a, 0x01, 0x6a, 0x15,
0x00, 0x00, 0x0c, 0xc2, 0x0a, 0x08, 0x0a, 0x01, 0x6b, 0x15, 0x00, 0x00,
0x10, 0xc2, 0x0a, 0x08, 0x0a, 0x01, 0x6d, 0x15, 0x00, 0x00, 0x14, 0xc2,
0x0a, 0x08, 0x0a, 0x01, 0x6e, 0x15, 0x00, 0x00, 0x18, 0xc2, 0x0a, 0x08,
0x0a, 0x01, 0x70, 0x15, 0x00, 0x00, 0x1c, 0xc2, 0x0a, 0x08, 0x0a, 0x01,
0x71, 0x15, 0x00, 0x00, 0x20, 0xc2, 0x0a, 0x08, 0x0a, 0x01, 0x73, 0x15,
0x00, 0x00, 0x24, 0xc2, 0x0a, 0x08, 0x0a, 0x01, 0x74, 0x15, 0x00, 0x00,
0x28, 0xc2, 0x0a, 0x08, 0x0a, 0x01, 0x75, 0x15, 0x00, 0x00, 0x2c, 0xc2,
0x0a, 0x08, 0x0a, 0x01, 0x76, 0x15, 0x00, 0x00, 0x30, 0xc2, 0x0a, 0x08,
0x0a, 0x01, 0x78, 0x15, 0x00, 0x00, 0x34, 0xc2, 0x0a, 0x08, 0x0a, 0x01,
0x79, 0x15, 0x00, 0x00, 0x38, 0xc2, 0x12, 0x3a, 0x0a, 0x13, 0x2f, 0x74,
0x6d, 0x70, 0x2f, 0x73, 0x70, 0x6d, 0x5f, 0x63, 0x6f, 0x72, 0x70, 0x75,
0x73, 0x2e, 0x74, 0x78, 0x74, 0x12, 0x0d, 0x2f, 0x74, 0x6d, 0x70, 0x2f,
0x73, 0x70, 0x6d, 0x5f, 0x74, 0x69, 0x6e, 0x79, 0x18, 0x02, 0x20, 0x32,
0x55, 0x00, 0x00, 0x80, 0x3f, 0x58, 0xe8, 0x07, 0x80, 0x01, 0x01, 0x88,
0x01, 0x01, 0x90, 0x01, 0xe8, 0x07, 0x1a, 0x10, 0x0a, 0x08, 0x69, 0x64,
0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x12, 0x00, 0x18, 0x00, 0x20, 0x01,
};
static const unsigned int kTestModelSize = 648;

// Reports a failed check on stderr and returns `ok`, so the caller can bail
// out with a non-zero exit code.
//
// Used instead of assert(): the test is configured with
// CMAKE_BUILD_TYPE=Release, which normally defines NDEBUG and compiles every
// assert() out, so a broken package would still "pass".
static bool Check(bool ok, const char* what) {
  if (!ok) {
    std::cerr << "CHECK FAILED: " << what << std::endl;
  }
  return ok;
}

// Smoke test for the installed SentencePiece library: loads the embedded
// model, then exercises vocabulary lookup, encoding and decoding.
// Returns 0 on success and 1 as soon as any check fails.
int main() {
  sentencepiece::SentencePieceProcessor processor;

  // 1. Load model from embedded serialized proto
  const std::string model_str(reinterpret_cast<const char*>(kTestModelData),
                              kTestModelSize);
  auto status = processor.LoadFromSerializedProto(model_str);
  if (!Check(status.ok(), "LoadFromSerializedProto")) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }

  // 2. Check vocab size
  const int vocab_size = processor.GetPieceSize();
  if (!Check(vocab_size > 0, "GetPieceSize() > 0")) {
    return 1;
  }
  std::cout << "Vocab size: " << vocab_size << std::endl;

  // 3. Test id_to_piece for a known token: the <unk> token should exist
  const int unk_id = processor.unk_id();
  if (!Check(unk_id >= 0, "unk_id() >= 0")) {
    return 1;
  }
  if (!Check(processor.IdToPiece(unk_id) == "<unk>",
             "IdToPiece(unk_id) == \"<unk>\"")) {
    return 1;
  }

  // 4. Encode some text containing known characters
  std::vector<int> ids;
  status = processor.Encode("abc", &ids);
  if (!Check(status.ok(), "Encode(\"abc\")")) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  if (!Check(!ids.empty(), "Encode(\"abc\") produced tokens")) {
    return 1;
  }
  std::cout << "Encode('abc') -> " << ids.size() << " tokens" << std::endl;

  // 5. Decode back
  std::string decoded;
  status = processor.Decode(ids, &decoded);
  if (!Check(status.ok(), "Decode(ids of \"abc\")")) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  std::cout << "Decode -> '" << decoded << "'" << std::endl;

  // 6. Test with text containing known merge pairs (like "ab")
  status = processor.Encode("ab", &ids);
  if (!Check(status.ok(), "Encode(\"ab\")")) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  std::cout << "Encode('ab') -> " << ids.size() << " tokens" << std::endl;

  status = processor.Decode(ids, &decoded);
  if (!Check(status.ok(), "Decode(ids of \"ab\")")) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  std::cout << "Decode -> '" << decoded << "'" << std::endl;

  // 7. Report basic model properties (informational only, not checked)
  const int bos_id = processor.bos_id();
  const int eos_id = processor.eos_id();
  std::cout << "BOS id: " << bos_id << ", EOS id: " << eos_id << std::endl;

  std::cout << "sentencepiece-cpp test passed!" << std::endl;
  return 0;
}