Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
*.DS_Store
*build
.results
*.plan
onnx_weights/
test_images/
31 changes: 28 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ docker run -it --rm \
```bash
python python/onnxexport.py
```
This produces `onnx_weights/sam3_static.onnx` plus external weight shards.
This produces `onnx_weights/sam3_dynamic.onnx` plus external weight shards.

5) Build a TensorRT engine
```bash
trtexec --onnx=onnx_weights/sam3_static.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose
trtexec --onnx=onnx_weights/sam3_dynamic.onnx --saveEngine=sam3_fp16.plan --fp16 --verbose
```

6) Build the C++/CUDA library and sample app
Expand Down Expand Up @@ -181,4 +181,29 @@ TensorRT + CUDA (benchmark mode disables output writes):
If this saved you time, drop a ⭐ so others can find it and ship SAM-3 faster.

# Disclaimer
All views expressed here are my own. This project is not affiliated with my employer.
All views expressed here are my own. This project is not affiliated with my employer.

## Dynamic Bounding Box Detection (New!)
The application has been extended to support **Native Bounding Box Detection** directly from the SAM3 model outputs, as well as **Dynamic Text Prompting** without hardcoded tokens.

### Setup Tokenizer
Because the C++ application relies on HuggingFace tokenization, you must first export the tokenizer files:
```bash
python3 python/export_tokenizer.py
```
*This will create `tokenizer.json` in the `onnx_weights/` directory for the `tokenize_prompt.py` script (invoked by the C++ application at runtime) to use.*

### Run Bounding Box Visualization
Run the application with your target prompt as the 3rd argument. The C++ application will dynamically tokenize the prompt, run the TensorRT engine, and draw green bounding boxes with the text label above them.
```bash
cd /workspace/cpp/build
make -j
./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet"
```

### Benchmark Bounding Box Inference
To test the raw speed of the `.plan` engine executing the prompt and calculating bounding boxes (without the latency of OpenCV drawing and saving the images), append `1` to the end of the command:
```bash
cd /workspace/cpp/build
./sam3_pcs_app /workspace/test_images /workspace/sam3_fp16.plan "helmet" 1
```
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ include_directories(
${CUDNN_ROOT_DIR}/include
)



add_library(sam3_trt SHARED
src/sam3/sam3_trt/sam3.cu
src/sam3/sam3_trt/prepost.cu
Expand Down
14 changes: 14 additions & 0 deletions cpp/include/prepost.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ __global__ void draw_instance_seg_mask(
float3* color_palette
);

// CUDA kernel: draws one candidate bounding box (selected by `box_idx`) onto the
// `result` image buffer when that candidate passes the probability threshold.
// NOTE(review): declaration only — the definition is not visible here. Parameter
// semantics below are inferred from the CPU-side drawing path in sam3_pcs_app.cpp,
// which treats box coordinates as normalized [0,1] and gates on
// sigmoid(logit) > prob_threshold; confirm against the kernel definition.
__global__ void draw_bounding_box(
    float* boxes,          // candidate box coordinates (presumably max_boxes x 4, normalized — verify)
    float* logits,         // per-candidate raw logits; presumably sigmoid-ed against prob_threshold
    uint8_t* result,       // output image pixels (src_height x src_width x src_channels)
    int src_width,
    int src_height,
    int src_channels,
    int max_boxes,         // number of candidate slots in `boxes`/`logits`
    int box_idx,           // index of the single candidate this launch draws
    float prob_threshold,  // probability cutoff; below it the box is skipped
    float3* color_palette, // device-side palette (see `colpal` below) — assumed, verify
    int thickness          // border thickness in pixels — assumed, verify
);

static std::vector<float3> colpal = {
make_float3( 0, 185, 118), // teal (your original)
make_float3(230, 159, 0), // orange
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/sam3.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ private:
cudaStream_t sam3_stream;
dim3 bsize;
dim3 gsize;
int in_width, in_height, opencv_inbytes;
int in_width, in_height, opencv_inbytes = 0;

std::vector<void*> input_cpu;
std::vector<void*> input_gpu;
Expand Down
3 changes: 2 additions & 1 deletion cpp/include/sam3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ typedef enum {
typedef enum {
VIS_NONE,
VIS_SEMANTIC_SEGMENTATION,
VIS_INSTANCE_SEGMENTATION
VIS_INSTANCE_SEGMENTATION,
VIS_BBOX
} SAM3_VISUALIZATION;

typedef struct {
Expand Down
177 changes: 114 additions & 63 deletions cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,63 @@
#include <chrono>
#include <thread>
#include <opencv2/imgproc.hpp>

void read_image_into_buffer(const std::string imgpath, char* raw_buffer, cv::Mat& buffer)
{
size_t file_size = std::filesystem::file_size(imgpath);
if (file_size==0)
{
std::stringstream err;
err << "Image file is empty";
throw std::runtime_error(err.str());
#include <fstream>
#include <array>
#include <memory>
#include <stdexcept>

// Helper to execute bash command and read stdout
std::string exec_python_tokenizer(const std::string& prompt) {
// Assuming the app is run from /workspace/cpp/build or /workspace
std::string cmd = "python3 /workspace/python/tokenize_prompt.py \"" + prompt + "\"";
std::array<char, 128> buffer;
std::string result;
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
throw std::runtime_error("popen() failed!");
}

std::ifstream file(imgpath, std::ios::binary);

if (!file.is_open())
{
std::stringstream err;
err << "File " << imgpath << " could not be opened. Please check permissions\n";
throw std::runtime_error(err.str());
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
result += buffer.data();
}

file.read(raw_buffer, file_size);
file.close();

cv::Mat raw_mat(1, static_cast<int>(file_size), CV_8UC1, raw_buffer);
// just a wrapper, minimal allocation

cv::imdecode(raw_mat, cv::IMREAD_COLOR, &buffer);
// Remove trailing newlines
result.erase(std::remove(result.begin(), result.end(), '\n'), result.end());
return result;
}

void infer_one_image(SAM3_PCS& pcs,
const cv::Mat& img,
cv::Mat& result,
const SAM3_VISUALIZATION vis,
const std::string outfile,
const std::string prompt,
bool benchmark_run)
{
bool success = pcs.infer_on_image(img, result, vis);

if (benchmark_run)
if (benchmark_run) return;

if (vis == SAM3_VISUALIZATION::VIS_BBOX)
{
return;
// CPU-side box coordinates and logits copied over by sam3.cu
float* boxes = static_cast<float*>(pcs.output_cpu[2]);
float* logits = static_cast<float*>(pcs.output_cpu[3]);
int num_boxes = 200;

for (int i=0; i < num_boxes; i++) {
float logit = logits[i];
float prob = 1.0f / (1.0f + std::exp(-logit));
if (prob > 0.5f) { // threshold match
float x1 = boxes[i * 4 + 0];
float y1 = boxes[i * 4 + 1];
int x_min = std::max(0, (int)(x1 * img.cols));
int y_min = std::max(0, (int)(y1 * img.rows));

// Draw text slightly above the bounding box
// Reduced font scale from 0.9 to 0.5, thickness from 2 to 1
cv::putText(result, prompt, cv::Point(x_min, std::max(15, y_min - 6)),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
}
}
}

if (vis == SAM3_VISUALIZATION::VIS_NONE)
Expand All @@ -59,88 +75,123 @@ void infer_one_image(SAM3_PCS& pcs,

int main(int argc, char* argv[])
{
if (argc < 3)
if (argc < 4)
{
std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine <benchmark=false>" << std::endl;
std::cout << "Usage: ./sam3_pcs_app indir engine_path.engine prompt <benchmark=0>" << std::endl;
return 0;
}

const std::string in_dir = argv[1];
std::string epath = argv[2];
bool benchmark=false; // in benchmarking mode we dont save output images
std::string prompt = argv[3];
bool benchmark = false; // in benchmarking mode we dont save output images

if (argc==4)
if (argc == 5)
{
std::string b_arg = argv[3]; // should be 0 or 1
try
{
benchmark = (b_arg == "1");
}
catch(const std::exception)
{
std::cout << "Unrecognized benchmark type " << argv[3] << std::endl;
}
benchmark = (std::string(argv[4]) == "1");
}
std::cout << "Target Prompt: " << prompt << std::endl;
std::cout << "Benchmarking: " << benchmark << std::endl;

auto start = std::chrono::system_clock::now();
auto end = std::chrono::system_clock::now();
std::chrono::duration<float> diff;
float millis_elapsed = 0.0; // int will overflow after ~650 hours
float millis_elapsed = 0.0;

const float vis_alpha = 0.3;
const float probability_threshold = 0.5;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_BBOX;

SAM3_PCS pcs(epath, vis_alpha, probability_threshold);

cv::Mat img, result;
char* raw_bytes;
int prev_width = 0, prev_height = 0;

std::filesystem::create_directories("results");
int num_images_read=0;

// tokenized version of 'person'
std::vector<int64_t> iid={49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407};

std::vector<int64_t> iam={1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
// Tokenize the prompt
std::vector<int64_t> iid(32, 49407); // 49407 is usually the PAD/EOS token for SAM3 text encoder
std::vector<int64_t> iam(32, 0);

try {
std::cout << "Calling Python to tokenize prompt: '" << prompt << "'..." << std::endl;
std::string py_out = exec_python_tokenizer(prompt);

if (py_out == "-1" || py_out.empty()) {
throw std::runtime_error("Python tokenizer script failed.");
}

// Parse comma separated string
std::vector<int32_t> ids;
std::stringstream ss(py_out);
std::string token;
while (std::getline(ss, token, ',')) {
ids.push_back(std::stoi(token));
}

for (size_t i = 0; i < ids.size() && i < 32; ++i) {
iid[i] = ids[i];
iam[i] = 1; // 1 for real tokens, 0 for pad
}
std::cout << "Successfully tokenized prompt into " << ids.size() << " tokens." << std::endl;

} catch (const std::exception& e) {
std::cerr << "Tokenizer error: " << e.what() << std::endl;
std::cout << "Falling back to 'person' tokens.\n";
iid = {49406, 2533, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
49407, 49407};
iam = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
}

pcs.set_prompt(iid, iam);

const int MAX_BENCHMARK_IMAGES = 100;

for (const auto& fname : std::filesystem::directory_iterator(in_dir))
{
if (std::filesystem::is_regular_file(fname.path()))
{
std::filesystem::path outfile = std::filesystem::path("results") / fname.path().filename();

if (num_images_read==0)
img = cv::imread(fname.path(), cv::IMREAD_COLOR);
if (img.empty()) continue;

result.create(img.rows, img.cols, img.type());

if (img.cols != prev_width || img.rows != prev_height)
{
cv::Mat tmp = cv::imread(fname.path(), cv::IMREAD_COLOR);
raw_bytes = (char *)malloc(tmp.total()*tmp.elemSize());
read_image_into_buffer(fname.path(), raw_bytes, img);
result = cv::imread(fname.path(), cv::IMREAD_COLOR);
pcs.pin_opencv_matrices(img, result);
prev_width = img.cols;
prev_height = img.rows;
}
else
{
read_image_into_buffer(fname.path(), raw_bytes, img);
}

start = std::chrono::system_clock::now();
infer_one_image(pcs, img, result, visualize, outfile, benchmark);
infer_one_image(pcs, img, result, visualize, outfile, prompt, benchmark);
num_images_read++;
end = std::chrono::system_clock::now();
diff = end - start;
millis_elapsed += (diff.count() * 1000);

if (num_images_read>0 && num_images_read%10==0)
{
float msec_per_image = millis_elapsed/num_images_read;
printf("Processed %d images at %f msec/image\n", num_images_read, msec_per_image);
printf("Processed %d images...\n", num_images_read);
}

if (num_images_read >= MAX_BENCHMARK_IMAGES) break;
}
}

if (num_images_read > 0)
{
float msec_per_image = millis_elapsed/num_images_read;
float est_1000_min = msec_per_image * 1000.0f / 1000.0f / 60.0f;
printf("\n=== Benchmark Results ===\n");
printf("Processed %d images in %.1f s\n", num_images_read, millis_elapsed/1000.0f);
printf("Average: %.2f msec/image\n", msec_per_image);
printf("Estimated time for 1000 images: %.1f min\n", est_1000_min);
}
}
Loading