Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 46 additions & 35 deletions preprocessors/object-detection-llm/object-detection-llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,33 +68,30 @@ def normalize_bbox(bbox, width, height):
]


def process_objects(objects, threshold):
def process_objects(qwen_output, width, height, threshold):
"""
Process detected objects by filtering, transforming, and enriching them.
Transform Qwen object detection output to IMAGE schema format.

- Filters objects by confidence threshold
- Transforms from Qwen format (bbox_2d, label) to IMAGE format
- Normalizes bounding boxes to [0,1] range
- Assigns confidence threshold to all objects
- Normalizes labels (replaces underscores with spaces)
- Renumbers IDs sequentially
- Calculates geometric properties (area, centroid)
- Filters objects by confidence threshold

Args:
objects (list): List of detected objects with confidence scores
qwen_output (list): Qwen detection output with bbox_2d and label
width (int): Image width in pixels for normalization
height (int): Image height in pixels for normalization
threshold (float): Minimum confidence score (0-1)

Returns:
list: Processed objects with computed properties
"""
processed = []
for obj in objects:
if obj.get("confidence", 0) >= threshold:
obj['type'] = obj['type'].replace('_', ' ')
processed.append(obj)

# Renumber IDs sequentially after filtering
for idx, obj in enumerate(processed):
obj['ID'] = idx

x1, y1, x2, y2 = obj["dimensions"]
for idx, item in enumerate(qwen_output):
# Normalize bounding box
x1, y1, x2, y2 = normalize_bbox(item["bbox_2d"], width, height)

# Calculate area (width * height)
area = (x2 - x1) * (y2 - y1)
Expand All @@ -103,13 +100,20 @@ def process_objects(objects, threshold):
centroid_x = (x1 + x2) / 2
centroid_y = (y1 + y2) / 2

# Create object entry according to schema
obj["area"] = area
obj["centroid"] = [centroid_x, centroid_y]
# Create object entry according to IMAGE schema
obj = {
"ID": idx,
"type": item["label"].replace('_', ' '),
"dimensions": [x1, y1, x2, y2],
"confidence": threshold,
"area": area,
"centroid": [centroid_x, centroid_y]
}

processed.append(obj)

logging.debug(
f"Processed {len(objects)} objects to {len(processed)} "
f"objects with confidence >= {threshold}"
f"Processed {len(qwen_output)} objects from Qwen output"
)
return processed

Expand Down Expand Up @@ -155,35 +159,42 @@ def detect_objects():
if error:
return jsonify(error), error["code"]

stop_tokens = [
"<|im_end|>", # Qwen's end token
"<|endoftext|>", # Alternative end token
"\n\n\n", # Triple newline
"```", # Code block end
]

try:
# Get object info
object_json = llm_client.chat_completion(
qwen_output = llm_client.chat_completion(
prompt=OBJECT_DETECTION_PROMPT,
image_base64=base64_image,
json_schema=BBOX_RESPONSE_SCHEMA,
temperature=0.0,
parse_json=True
temperature=0.5,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

curious why the move back to non-zero temperature?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is one of the theories about why this is happening: the same problem has been reported on GitHub.
When the temperature is left at its default, it happens only sometimes, but when the temperature is set to 0, it happens every time.

I am not 100% sure it's affecting the performance in our specific case (too many variables: model, quantization, engine...), but I'm willing to try since it doesn't seem to affect the accuracy of outputs.

parse_json=True,
stop=stop_tokens
)

if object_json is None or len(object_json.get("objects", [])) == 0:
logging.debug(f"Qwen output received: {qwen_output}")

if qwen_output is None or len(qwen_output) == 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No Content (204) isn't really an error if there is legitimately nothing to extract. Ideally, this should distinguish between an actual error (something went wrong, so there is nothing to report) and a successful run in which everything worked but there were no objects to extract.

logging.error("Failed to extract objects from the graphic.")
return jsonify({"error": "No objects extracted"}), 204

# Normalize bounding boxes
# Transform Qwen format to IMAGE schema format
width, height = pil_image.size
for obj in object_json["objects"]:
# Normalize bounding boxes
obj["dimensions"] = normalize_bbox(
obj["dimensions"], width, height
)

# Filter objects by confidence threshold, add area and centroid,
# remove underscores from labels, and renumber IDs
object_json["objects"] = process_objects(
object_json["objects"],
processed_objects = process_objects(
qwen_output,
width,
height,
CONF_THRESHOLD
)

# Wrap in "objects" for schema compliance
object_json = {"objects": processed_objects}

logging.pii(f"Normalized output: {object_json}")

# Data schema validation
Expand Down
58 changes: 18 additions & 40 deletions preprocessors/object-detection-llm/object-detection.schema.json
Original file line number Diff line number Diff line change
@@ -1,45 +1,23 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"type": "object",
"type": "array",
"title": "Object Detection Data",
"description": "Detected object data with bounding boxes.",
"definitions": {
"object": {
"type": "object",
"title": "BoundingBoxItem",
"properties": {
"ID": {
"description": "A number identifying this object in the set.",
"type": "integer"
},
"type": {
"description": "The type of object detected (e.g., 'person', 'car').",
"type": "string"
},
"dimensions": {
"description": "Bounding box coordinates of this object [x1, y1, x2, y2].",
"type": "array",
"items": { "type": "number" },
"minItems": 4,
"maxItems": 4,
"additionalItems": false
},
"confidence": {
"description": "Confidence in the correctness of this object's data (0-1).",
"type": "number",
"minimum": 0,
"maximum": 1
}
"description": "Detected object data with bounding boxes in Qwen format.",
"items": {
"type": "object",
"properties": {
"bbox_2d": {
"description": "Bounding box coordinates [x1, y1, x2, y2].",
"type": "array",
"items": { "type": "number" },
"minItems": 4,
"maxItems": 4
},
"required": ["ID", "type", "dimensions", "confidence"]
}
},
"properties": {
"objects": {
"description": "The set of detected objects in the image.",
"type": "array",
"items": { "$ref": "#/definitions/object" }
}
},
"required": ["objects"]
"label": {
"description": "The type of object detected (e.g., 'person', 'car').",
"type": "string"
}
},
"required": ["bbox_2d", "label"]
}
}
34 changes: 15 additions & 19 deletions utils/llm/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,35 @@
"""
# Object detection
OBJECT_DETECTION_PROMPT = """
Give the bounding boxes for the objects found in this image.
Step 1:
Determine from 0 to 10 major and important objects in the image.
Focus ONLY on the objects that are clearly visible and identifiable.

Step 2:
Give the bounding boxes for the objects determined in the first step.
Output a only JSON list of bounding boxes where each entry contains:
- the unique numeric ID in the key "ID",
- the object label in the key "type",
- the pixel coordinates of a 2D bounding box in the key "dimensions",
- and the confidence score in the key "confidence".
- the pixel coordinates of a 2D bounding box in the key "bbox_2d",
- the object label in the key "label".

Example:
```json
{
"objects": [
[
{
"ID": 0,
"type": "car",
"dimensions": [120, 200, 300, 450],
"confidence": 0.92
"bbox_2d": [120, 200, 300, 450],
"label": "car",
},
{
"ID": 1,
"type": "person",
"dimensions": [50, 100, 120, 300],
"confidence": 0.95
"bbox_2d": [50, 100, 120, 300],
"label": "person",
}
]
}

]
```
Ensure that the bounding boxes are in the format [x1, y1, x2, y2].

Rules:
1. Focus ONLY on the major and important objects in the image.
2. The graphic can contain any number of objects, from zero to many.
3. If no objects are detected, return an empty list: {"objects": []}.
3. If no objects are detected, return an empty list: [].
4. Use simple and common object labels (e.g., "car", "person", "tree").
5. Include ONLY objects that are clearly visible and identifiable.
6. Multiple objects can have the same confidence score.
Expand Down