Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
5078a86
Add validate_output to all benchmarks and integrate into regression t…
Copilot Apr 15, 2026
7df4d28
Improve PUT /cart validation to check for empty dict response
Copilot Apr 15, 2026
771ef1c
Add --validate/--no-validate option to benchmark invoke CLI command
Copilot Apr 16, 2026
7372693
Address reviewer comments: improve benchmark output validation
Copilot Apr 16, 2026
58d3ad4
Fix PEP 8 style: split multi-line imports in validators
Copilot Apr 16, 2026
e82663e
[benchmarks] Validation for 110.dynamic-html
mcopik Apr 19, 2026
7de6ffb
[system] Simplify errors
mcopik Apr 19, 2026
2d33787
[system] Remove unnecessary redaction from default runs
mcopik Apr 19, 2026
6fbb81a
[benchmarks] Improved validation for 110.dynamic-html
mcopik Apr 19, 2026
8389386
[benchmarks] Validation for 120.uploader
mcopik Apr 19, 2026
7b46d52
[system] Improved validation of benchmark results
mcopik Apr 19, 2026
7efa207
[benchmarks] Add validation of 210.thumbnailer
mcopik Apr 19, 2026
c6c3897
[benchmarks] Add proper inputs to 220.video-processing and add valida…
mcopik Apr 19, 2026
8cf7bb3
[benchmarks] Fixed to 210.thumbnailer validation
mcopik Apr 19, 2026
30ba620
[benchmarks] Validation of benchmark 311
mcopik Apr 19, 2026
070c6bb
[benchmarks] Add verification of 504 benchmark
mcopik Apr 20, 2026
d488cbe
[benchmarks] Add validation of 130
mcopik Apr 20, 2026
31e7cc9
[benchmarks] Add validation of 411
mcopik Apr 20, 2026
ce38189
[system] Bump version
mcopik Apr 20, 2026
8e0f0bf
[system] Standardize benchmark validation interface
mcopik Apr 20, 2026
96ab08b
[benchmarks] Add validation for 501
mcopik Apr 20, 2026
c556bd9
[benchmarks] Validate 503
mcopik Apr 20, 2026
2d02447
[benchmarks] Major change: standardize output format of benchmarks be…
mcopik Apr 20, 2026
b2eceb4
[benchmarks] Major change: 502 will now return the full tree
mcopik Apr 20, 2026
2cb37d7
[benchmarks] Validate 502
mcopik Apr 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions benchmarks/000.microbenchmarks/010.sleep/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,10 @@ def buckets_count():

def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func):
    """Produce the benchmark payload: the sleep duration for the given size."""
    duration = size_generators[size]
    return {'sleep': duration}

def validate_output(input_config: dict, output: dict, language: str, storage = None) -> str | None:

if output.get('result') != input_config.get('sleep'):
return f"Expected sleep duration {input_config.get('sleep')} but got {output.get('result')}"

return None
21 changes: 21 additions & 0 deletions benchmarks/100.webapps/110.dynamic-html/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,24 @@ def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths,
input_config = {'username': 'testname'}
input_config['random_len'] = size_generators[size]
return input_config

def validate_output(input_config: dict, output: dict, language: str, storage = None) -> str | None:

result = output.get('result', '')
username = input_config.get('username', '')
random_len = input_config.get('random_len', 0)

if not isinstance(result, str) or len(result) == 0:
return f"Output is not a non-empty string (type={type(result).__name__}, len={len(result) if isinstance(result, str) else 'N/A'})"

if f'Welcome {username}!' not in result:
return f"Missing expected username greeting 'Welcome {username}!' in HTML output"

if 'Data generated at:' not in result:
return "Missing expected timestamp text 'Data generated at:' in HTML output"

actual_li_count = result.count('<li>')
if actual_li_count != random_len:
return f"Expected {random_len} list items but found {actual_li_count} <li> tags in HTML output"

return None
2 changes: 2 additions & 0 deletions benchmarks/100.webapps/120.uploader/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
## Description

The benchmark implements the common workflow of uploading user-defined data to the persistent cloud storage. It accepts a URL, downloads file contents, and uploads them to the storage. Python implementation uses the standard library `requests`, while the Node.js version uses the third-party `requests` library installed with `npm`.

While 128 MB of memory is technically sufficient, a larger memory allocation or a longer timeout might be required for the `large` input size.
2 changes: 1 addition & 1 deletion benchmarks/100.webapps/120.uploader/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"timeout": 30,
"timeout": 60,
"memory": 128,
"languages": ["python", "nodejs"],
"modules": ["storage"]
Expand Down
56 changes: 55 additions & 1 deletion benchmarks/100.webapps/120.uploader/input.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved.
import hashlib
import os
import tempfile

url_generators = {
# source: mlperf fake_imagenet.sh. 230 kB
'test' : 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/Jammlich_crop.jpg/800px-Jammlich_crop.jpg',
# video: HPX source code, 6.7 MB
'small': 'https://github.com/STEllAR-GROUP/hpx/archive/refs/tags/1.4.0.zip',
# resnet model from pytorch. 98M
'large': 'https://download.pytorch.org/models/resnet50-19c8e357.pth'
'large': 'https://download.pytorch.org/models/resnet50-19c8e357.pth'
}

# MD5 checksums for stable reference objects.
# These are computed from the objects at the URLs above and must be updated
# if the remote files change.
expected_checksums = {
'test': '91799b8ca818598fc5b8790f3b338150',
'small': 'baf7ea99128aa3e5c2d0c8b8f61cce1b',
'large': '9e9c86b324d80e65229fab49b8d9a8e8'
}

def buckets_count():
Expand All @@ -15,6 +27,48 @@ def buckets_count():
def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func):
    """Build the uploader input: the URL to fetch plus target bucket info."""
    return {
        'object': {'url': url_generators[size], 'size': size},
        'bucket': {'bucket': benchmarks_bucket, 'output': output_buckets[0]},
    }

def validate_output(input_config: dict, output: dict, language: str, storage = None) -> str | None:

result = output.get('result', {})
key = result.get('key', '')
url = input_config.get('object', {}).get('url', '')
size = input_config.get('object', {}).get('size', '')

if not isinstance(key, str) or len(key) == 0:
return f"Output key is missing or invalid (type={type(key).__name__}, value='{key}')"

if result.get('url') != url:
return f"Output URL mismatch: expected '{url}' but got '{result.get('url')}'"

if storage is None:
return None

bucket = input_config.get('bucket', {}).get('bucket', '')
expected_name = os.path.basename(url)
# Storage client adds unique hash: filename.{hash}.ext
# Check that the key contains the base filename and has the same extension
expected_base, expected_ext = os.path.splitext(expected_name)
key_base = os.path.basename(key)
if not (expected_base in key_base and key_base.endswith(expected_ext)):
return f"Storage key '{key_base}' does not match expected pattern (base='{expected_base}', ext='{expected_ext}')"

with tempfile.NamedTemporaryFile(delete=False) as f:
tmp_path = f.name
try:
storage.download(bucket, key, tmp_path)
file_size = os.path.getsize(tmp_path)
if file_size == 0:
return f"Downloaded file from storage is empty (bucket='{bucket}', key='{key}')"

with open(tmp_path, 'rb') as f:
actual_md5 = hashlib.md5(f.read()).hexdigest()
if actual_md5 != expected_checksums[size]:
return f"MD5 checksum mismatch for size '{size}': expected '{expected_checksums[size]}' but got '{actual_md5}'"
finally:
os.unlink(tmp_path)
return None
87 changes: 87 additions & 0 deletions benchmarks/100.webapps/130.crud-api/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,5 +93,92 @@ def generate_input(
)

input_config["requests"] = requests
input_config["size"] = size

return input_config

def validate_output(input_config: dict, output: dict, language: str, storage = None) -> str | None:

results = output.get('result', [])
requests = input_config.get('requests', [])

if not isinstance(results, list):
return f"Output results is not a list (type={type(results).__name__})"

if len(results) != len(requests):
return f"Results count mismatch: expected {len(requests)} responses but got {len(results)}"

if len(results) == 1:
"""
test input -> one result for a single cart item
small input -> one result for entire cart
"""
request = requests[0]
result = results[0]
route = request.get('route')

if route == 'GET /cart/{id}':

expected_item = {"name": "Gothic Game", "price": 42, "quantity": 2}
expected_item["cart_id"] = request["body"]["cart"]
expected_item["product_id"] = request["path"]["id"]

if expected_item != result:
return f"Wrong item details for GET /cart/{{id}}: expected {expected_item} but got {result}"

elif route == 'GET /cart':

products = [
("Gothic Game", 42, 2),
("Gothic 2", 142, 3),
("SeBS Benchmark", 1000, 1),
("Mint Linux", 0, 5)
]
total_cost = sum([p[1] * p[2] for p in products])
items = sum([p[2] for p in products])
cart = [p[0] for p in products]

if sorted(cart) != sorted(result['products']):
return f"Wrong product details for GET /cart: expected {cart} but got {result['products']}"

if total_cost != result['total_cost']:
return f"Wrong product details for GET /cart: expected {total_cost}, but got {result['total_cost']}"

if abs(total_cost / items - result['avg_price']) > 1e-6:
return f"Wrong product details for GET /cart: expected {total_cost/items}, but got {result['products']}"
else:
return f"Unexpected route in single-result output: expected 'GET /cart/{{id}}' or 'GET /cart' but got '{route}'"
else:
"""
large input -> 10 responses,
"""

put_results = results[0::2]
get_results = results[1::2]

for put_result in put_results:
if put_result != {}:
return f"PUT /cart expected empty dict {{}} but got {put_result}"

current_cost = 0
current_quantity = 0
items = []
for idx, get_result in enumerate(get_results):

items.append(f"Test Item {idx}")
current_cost += 100 * idx * idx
current_quantity += idx

if get_result['products'] != items:
return f"Wrong product details for GET /cart: expected {items} but got {get_result['products']}"

if current_cost != get_result['total_cost']:
return f"Wrong product details for GET /cart: expected {current_cost}, but got {get_result['total_cost']}"

if current_quantity == 0:
continue

if abs(current_cost / current_quantity - get_result['avg_price']) > 1e-6:
return f"Wrong product details for GET /cart: expected {current_cost/current_quantity}, but got {get_result['products']}"

return None
2 changes: 1 addition & 1 deletion benchmarks/100.webapps/130.crud-api/python/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def query_products(cart_id: str):
for product in res:

products.append(product["name"])
price_sum += product["price"]
price_sum += product["price"] * product["quantity"]
quantity_sum += product["quantity"]

avg_price = price_sum / quantity_sum if quantity_sum > 0 else 0.0
Expand Down
44 changes: 43 additions & 1 deletion benchmarks/200.multimedia/210.thumbnailer/input.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved.
import glob, os
import glob
import os
import tempfile

from PIL import Image

def buckets_count():
return (1, 1)
Expand Down Expand Up @@ -28,3 +32,41 @@ def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths,
input_config['bucket']['input'] = input_paths[0]
input_config['bucket']['output'] = output_paths[0]
return input_config

def validate_output(input_config: dict, output: dict, language: str, storage = None) -> str | None:

result = output.get('result', {})
key = result.get('key', '')

if not isinstance(key, str) or len(key) == 0:
return f"Output key is missing or invalid (type={type(key).__name__}, value='{key}')"

if storage is None:
return None

bucket = input_config.get('bucket', {}).get('bucket', '')
max_width = input_config.get('object', {}).get('width', 0)
max_height = input_config.get('object', {}).get('height', 0)

with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as f:
tmp_path = f.name
try:
storage.download(bucket, key, tmp_path)
file_size = os.path.getsize(tmp_path)
if file_size == 0:
return f"Downloaded thumbnail from storage is empty (bucket='{bucket}', key='{key}')"

try:
with Image.open(tmp_path) as img:
w, h = img.size
if w <= 0 or h <= 0:
return f"Thumbnail has invalid dimensions: width={w}, height={h}"
if w > max_width:
return f"Thumbnail width {w} exceeds maximum {max_width}"
if h > max_height:
return f"Thumbnail height {h} exceeds maximum {max_height}"
return None
except Exception as e:
return f"Failed to open or validate thumbnail image: {str(e)}"
finally:
os.unlink(tmp_path)
2 changes: 2 additions & 0 deletions benchmarks/200.multimedia/220.video-processing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
## Description

The benchmark implements two operations on video files: adding a watermark and creating a gif. Both input and output media are passed through the cloud storage. To process the video, the benchmark uses `ffmpeg`. The benchmark installs the most recent static binary of `ffmpeg` provided by [John van Sickle](https://johnvansickle.com/ffmpeg/).

While 512 MB is technically sufficient and works well for the watermark operations, the `large` input converts the video into a gif which is more computationally intensive. On AWS, you should expect around ~30 seconds runtime on 1024 MiB allocation.
72 changes: 66 additions & 6 deletions benchmarks/200.multimedia/220.video-processing/input.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved.
import glob, os
import glob
import os
import tempfile
import hashlib

def buckets_count():
return (1, 1)

# MD5 checksums for video processing output (operation, duration) -> hash
# These checksums ensure ffmpeg produces deterministic output
expected_checksums = {
('watermark', 1): '87f3a1ef9d90f93fd24c19ad0209a913',
('watermark', 3): '98286aa95fdbd7501b2cf244027c0ca2',
('extract-gif', 2): '20c17009382df93f6fcbf7ba1c53def0'
}

'''
Generate test, small and large workload for thumbnailer.
Generate test, small and large workload for video processing.

:param data_dir: directory where benchmark data is placed
:param size: workload size
Expand All @@ -17,12 +28,61 @@ def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths,
for file in glob.glob(os.path.join(data_dir, '*.mp4')):
img = os.path.relpath(file, data_dir)
upload_func(0, img, file)
#TODO: multiple datasets

# Different operations for different sizes to test various video processing modes
# Note: extract-gif can timeout on some configurations (long and heavy)
size_configs = {
'test': {'op': 'watermark', 'duration': 1},
'small': {'op': 'watermark', 'duration': 3},
'large': {'op': 'extract-gif', 'duration': 2},
}

config = size_configs.get(size, size_configs['test'])

input_config = {'object': {}, 'bucket': {}}
input_config['object']['key'] = img
input_config['object']['op'] = 'watermark'
input_config['object']['duration'] = 1
input_config['object']['key'] = "city.mp4"
input_config['object']['op'] = config['op']
input_config['object']['duration'] = config['duration']
input_config['bucket']['bucket'] = benchmarks_bucket
input_config['bucket']['input'] = input_paths[0]
input_config['bucket']['output'] = output_paths[0]
return input_config

def validate_output(input_config: dict, output: dict, language: str, storage = None) -> str | None:

result = output.get('result', {})
key = result.get('key', '')

if not isinstance(key, str) or len(key) == 0:
return f"Output key is missing or invalid (type={type(key).__name__}, value='{key}')"

if storage is None:
return None

bucket = input_config.get('bucket', {}).get('bucket', '')
op = input_config.get('object', {}).get('op', '')
duration = input_config.get('object', {}).get('duration', 0)

suffix = os.path.splitext(key)[1] or '.tmp'
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
tmp_path = f.name
try:
storage.download(bucket, key, tmp_path)
file_size = os.path.getsize(tmp_path)
if file_size == 0:
return f"Downloaded video output from storage is empty (bucket='{bucket}', key='{key}')"

# Check MD5 checksum if available for this operation
checksum_key = (op, duration)
if checksum_key not in expected_checksums:
return f"Missing validation configuration for ({op}, {duration})!"

with open(tmp_path, 'rb') as f:
actual_md5 = hashlib.md5(f.read()).hexdigest()
expected_md5 = expected_checksums[checksum_key]
if actual_md5 != expected_md5:
return f"MD5 checksum mismatch for op='{op}' duration={duration}: expected '{expected_md5}' but got '{actual_md5}'"

return None
finally:
os.unlink(tmp_path)
Loading
Loading