Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
*.DS_Store
*build
.results
*.plan
onnx_weights/
test_images/
31 changes: 28 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ docker run -it --rm \
```bash
python python/onnxexport.py
```
This produces `onnx_weights/sam3_static.onnx` plus external weight shards.
This produces `onnx_weights/sam3_dynamic.onnx` plus external weight shards.

5) Build a TensorRT engine
```bash
trtexec --onnx=onnx_weights/sam3_static.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose
trtexec --onnx=onnx_weights/sam3_dynamic.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose
```

6) Build the C++/CUDA library and sample app
Expand Down Expand Up @@ -181,4 +181,29 @@ TensorRT + CUDA (benchmark mode disables output writes):
If this saved you time, drop a ⭐ so others can find it and ship SAM-3 faster.

# Disclaimer
All views expressed here are my own. This project is not affiliated with my employer.
All views expressed here are my own. This project is not affiliated with my employer.

## Dynamic Bounding Box Detection (New!)
The application has been extended to support **Native Bounding Box Detection** directly from the SAM3 model outputs, as well as **Dynamic Text Prompting** without hardcoded tokens.

### Setup Tokenizer
Because the C++ application relies on HuggingFace tokenization, you must first export the tokenizer files:
```bash
python3 python/export_tokenizer.py
```
*This will create `tokenizer.json` in the `onnx_weights/` directory for the `tokenize_prompt.py` script (invoked by the C++ application at runtime) to use.*

### Run Bounding Box Visualization
Run the application with your target prompt as the 3rd argument. The C++ application will dynamically tokenize the prompt, run the TensorRT engine, and draw green bounding boxes with the text label above them.
```bash
cd /workspace/cpp/build
make -j
./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet"
```

### Benchmark Bounding Box Inference
To test the raw speed of the `.plan` engine executing the prompt and calculating bounding boxes (without the latency of OpenCV drawing and saving the images), append `1` to the end of the command:
```bash
cd /workspace/cpp/build
./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet" 1
```
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ include_directories(
${CUDNN_ROOT_DIR}/include
)



add_library(sam3_trt SHARED
src/sam3/sam3_trt/sam3.cu
src/sam3/sam3_trt/prepost.cu
Expand Down
14 changes: 14 additions & 0 deletions cpp/include/prepost.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ __global__ void draw_instance_seg_mask(
float3* color_palette
);

// CUDA kernel: draws one candidate bounding box (selected by `box_idx`) onto the
// `result` image buffer when that candidate passes the probability threshold.
// NOTE(review): declaration only — the definition is not visible here. Parameter
// semantics below are inferred from the CPU-side drawing path in sam3_pcs_app.cpp,
// which treats box coordinates as normalized [0,1] and gates on
// sigmoid(logit) > prob_threshold; confirm against the kernel definition.
__global__ void draw_bounding_box(
    float* boxes,          // candidate box coordinates (presumably max_boxes x 4, normalized — verify)
    float* logits,         // per-candidate raw logits; presumably sigmoid-ed against prob_threshold
    uint8_t* result,       // output image pixels (src_height x src_width x src_channels)
    int src_width,
    int src_height,
    int src_channels,
    int max_boxes,         // number of candidate slots in `boxes`/`logits`
    int box_idx,           // index of the single candidate this launch draws
    float prob_threshold,  // probability cutoff; below it the box is skipped
    float3* color_palette, // device-side palette (see `colpal` below) — assumed, verify
    int thickness          // border thickness in pixels — assumed, verify
);

static std::vector<float3> colpal = {
make_float3( 0, 185, 118), // teal (your original)
make_float3(230, 159, 0), // orange
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/sam3.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ private:
cudaStream_t sam3_stream;
dim3 bsize;
dim3 gsize;
int in_width, in_height, opencv_inbytes;
int in_width, in_height, opencv_inbytes = 0;

std::vector<void*> input_cpu;
std::vector<void*> input_gpu;
Expand Down
3 changes: 2 additions & 1 deletion cpp/include/sam3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ typedef enum {
typedef enum {
VIS_NONE,
VIS_SEMANTIC_SEGMENTATION,
VIS_INSTANCE_SEGMENTATION
VIS_INSTANCE_SEGMENTATION,
VIS_BBOX
} SAM3_VISUALIZATION;

typedef struct {
Expand Down
177 changes: 114 additions & 63 deletions cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,63 @@
#include <chrono>
#include <thread>
#include <opencv2/imgproc.hpp>

void read_image_into_buffer(const std::string imgpath, char* raw_buffer, cv::Mat& buffer)
{
size_t file_size = std::filesystem::file_size(imgpath);
if (file_size==0)
{
std::stringstream err;
err << "Image file is empty";
throw std::runtime_error(err.str());
#include <fstream>
#include <array>
#include <memory>
#include <stdexcept>

// Helper to execute bash command and read stdout
std::string exec_python_tokenizer(const std::string& prompt) {
// Assuming the app is run from /workspace/cpp/build or /workspace
std::string cmd = "python3 /workspace/python/tokenize_prompt.py \"" + prompt + "\"";
std::array<char, 128> buffer;
std::string result;
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
throw std::runtime_error("popen() failed!");
}

std::ifstream file(imgpath, std::ios::binary);

if (!file.is_open())
{
std::stringstream err;
err << "File " << imgpath << " could not be opened. Please check permissions\n";
throw std::runtime_error(err.str());
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
result += buffer.data();
}

file.read(raw_buffer, file_size);
file.close();

cv::Mat raw_mat(1, static_cast<int>(file_size), CV_8UC1, raw_buffer);
// just a wrapper, minimal allocation

cv::imdecode(raw_mat, cv::IMREAD_COLOR, &buffer);
// Remove trailing newlines
result.erase(std::remove(result.begin(), result.end(), '\n'), result.end());
return result;
}

void infer_one_image(SAM3_PCS& pcs,
const cv::Mat& img,
cv::Mat& result,
const SAM3_VISUALIZATION vis,
const std::string outfile,
const std::string prompt,
bool benchmark_run)
{
bool success = pcs.infer_on_image(img, result, vis);

if (benchmark_run)
if (benchmark_run) return;

if (vis == SAM3_VISUALIZATION::VIS_BBOX)
{
return;
// CPU-side box coordinates and logits copied over by sam3.cu
float* boxes = static_cast<float*>(pcs.output_cpu[2]);
float* logits = static_cast<float*>(pcs.output_cpu[3]);
int num_boxes = 200;

for (int i=0; i < num_boxes; i++) {
float logit = logits[i];
float prob = 1.0f / (1.0f + std::exp(-logit));
if (prob > 0.5f) { // threshold match
float x1 = boxes[i * 4 + 0];
float y1 = boxes[i * 4 + 1];
int x_min = std::max(0, (int)(x1 * img.cols));
int y_min = std::max(0, (int)(y1 * img.rows));

// Draw text slightly above the bounding box
// Reduced font scale from 0.9 to 0.5, thickness from 2 to 1
cv::putText(result, prompt, cv::Point(x_min, std::max(15, y_min - 6)),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
}
}
}

if (vis == SAM3_VISUALIZATION::VIS_NONE)
Expand All @@ -59,88 +75,123 @@ void infer_one_image(SAM3_PCS& pcs,

int main(int argc, char* argv[])
{
if (argc < 3)
if (argc < 4)
{
std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine <benchmark=false>" << std::endl;
std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine prompt <benchmark=0>" << std::endl;
return 0;
}

const std::string in_dir = argv[1];
std::string epath = argv[2];
bool benchmark=false; // in benchmarking mode we dont save output images
std::string prompt = argv[3];
bool benchmark = false; // in benchmarking mode we dont save output images

if (argc==4)
if (argc == 5)
{
std::string b_arg = argv[3]; // should be 0 or 1
try
{
benchmark = (b_arg == "1");
}
catch(const std::exception)
{
std::cout << "Unrecognized benchmark type " << argv[3] << std::endl;
}
benchmark = (std::string(argv[4]) == "1");
}
std::cout << "Target Prompt: " << prompt << std::endl;
std::cout << "Benchmarking: " << benchmark << std::endl;

auto start = std::chrono::system_clock::now();
auto end = std::chrono::system_clock::now();
std::chrono::duration<float> diff;
float millis_elapsed = 0.0; // int will overflow after ~650 hours
float millis_elapsed = 0.0;

const float vis_alpha = 0.3;
const float probability_threshold = 0.5;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_BBOX;

SAM3_PCS pcs(epath, vis_alpha, probability_threshold);

cv::Mat img, result;
char* raw_bytes;
int prev_width = 0, prev_height = 0;

std::filesystem::create_directories("results");
int num_images_read=0;

// tokenized version of 'person'
std::vector<int64_t> iid={49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407};

std::vector<int64_t> iam={1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
// Tokenize the prompt
std::vector<int64_t> iid(32, 49407); // 49407 is usually the PAD/EOS token for SAM3 text encoder
std::vector<int64_t> iam(32, 0);

try {
std::cout << "Calling Python to tokenize prompt: '" << prompt << "'..." << std::endl;
std::string py_out = exec_python_tokenizer(prompt);

if (py_out == "-1" || py_out.empty()) {
throw std::runtime_error("Python tokenizer script failed.");
}

// Parse comma separated string
std::vector<int32_t> ids;
std::stringstream ss(py_out);
std::string token;
while (std::getline(ss, token, ',')) {
ids.push_back(std::stoi(token));
}

for (size_t i = 0; i < ids.size() && i < 32; ++i) {
iid[i] = ids[i];
iam[i] = 1; // 1 for real tokens, 0 for pad
}
std::cout << "Successfully tokenized prompt into " << ids.size() << " tokens." << std::endl;

} catch (const std::exception& e) {
std::cerr << "Tokenizer error: " << e.what() << std::endl;
std::cout << "Falling back to 'person' tokens.\n";
iid = {49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407};
iam = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
}

pcs.set_prompt(iid, iam);

const int MAX_BENCHMARK_IMAGES = 100;

for (const auto& fname : std::filesystem::directory_iterator(in_dir))
{
if (std::filesystem::is_regular_file(fname.path()))
{
std::filesystem::path outfile = std::filesystem::path("results") / fname.path().filename();

if (num_images_read==0)
img = cv::imread(fname.path(), cv::IMREAD_COLOR);
if (img.empty()) continue;

result.create(img.rows, img.cols, img.type());

if (img.cols != prev_width || img.rows != prev_height)
{
cv::Mat tmp = cv::imread(fname.path(), cv::IMREAD_COLOR);
raw_bytes = (char *)malloc(tmp.total()*tmp.elemSize());
read_image_into_buffer(fname.path(), raw_bytes, img);
result = cv::imread(fname.path(), cv::IMREAD_COLOR);
pcs.pin_opencv_matrices(img, result);
prev_width = img.cols;
prev_height = img.rows;
}
else
{
read_image_into_buffer(fname.path(), raw_bytes, img);
}

start = std::chrono::system_clock::now();
infer_one_image(pcs, img, result, visualize, outfile, benchmark);
infer_one_image(pcs, img, result, visualize, outfile, prompt, benchmark);
num_images_read++;
end = std::chrono::system_clock::now();
diff = end - start;
millis_elapsed += (diff.count() * 1000);

if (num_images_read>0 && num_images_read%10==0)
{
float msec_per_image = millis_elapsed/num_images_read;
printf("Processed %d images at %f msec/image\n", num_images_read, msec_per_image);
printf("Processed %d images...\n", num_images_read);
}

if (num_images_read >= MAX_BENCHMARK_IMAGES) break;
}
}

if (num_images_read > 0)
{
float msec_per_image = millis_elapsed/num_images_read;
float est_1000_min = msec_per_image * 1000.0f / 1000.0f / 60.0f;
printf("\n=== Benchmark Results ===\n");
printf("Processed %d images in %.1f s\n", num_images_read, millis_elapsed/1000.0f);
printf("Average: %.2f msec/image\n", msec_per_image);
printf("Estimated time for 1000 images: %.1f min\n", est_1000_min);
}
}
Loading