
Commit 2b427fb

Add CUBLAS determinism experiment, APC root cause analysis, and updated results
- Switch vllm_backend.py to stream=False to match SystemDS
- Update results/ with the CUBLAS deterministic run (vLLM + SystemDS, H100, Mar 2)
- README: add CUBLAS experiment results table (207/250 = 82.8% identical)
- README: document vLLM Automatic Prefix Caching (APC) as the root cause of the remaining 43 divergent samples, proven by an order-reversal experiment (43/43 swap, 0 exceptions)
- README: add server-log evidence showing the prefix cache hit rate rising from 9% to 55%
- README: add concrete swap examples, including factual hallucinations (xsum-30 athlete name, xsum-42 year, xsum-89 country) that follow cache state, not backend identity
- README: explain the APC mechanism: a cold cache runs the full prefill kernel, while a warm cache skips prefill and loads stored KV tensors through a different code path; outputs are deterministic given a fixed cache state, temperature=0, deterministic cuBLAS, and sequential requests
- Fix LlmPredictCPInstruction, JMLCLLMInferenceTest, workload configs, and test files
1 parent bf72b49 commit 2b427fb

45 files changed

Lines changed: 2046 additions & 939 deletions
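
The order-reversal experiment named in the commit message can be reproduced with a short comparison script. A minimal sketch, assuming each run saves its per-sample outputs as a JSON list of {"id", "text"} records (file names and layout are illustrative, not part of this commit):

import json

def load(path):
    # Assumed layout: a JSON list of {"id": ..., "text": ...} records per run.
    with open(path) as f:
        return {r["id"]: r["text"] for r in json.load(f)}

# Forward order: the vLLM client runs first (cold cache), SystemDS second (warm).
vllm_fwd = load("results/vllm_forward.json")
sysds_fwd = load("results/systemds_forward.json")
# Reversed order: the SystemDS client runs first, vLLM second.
vllm_rev = load("results/vllm_reversed.json")
sysds_rev = load("results/systemds_reversed.json")

divergent = [i for i in vllm_fwd if vllm_fwd[i] != sysds_fwd[i]]
# If APC cache state (cold vs. warm prefix cache) drives the divergence,
# each divergent pair should swap roles when the request order is reversed.
swapped = [i for i in divergent
           if vllm_rev[i] == sysds_fwd[i] and sysds_rev[i] == vllm_fwd[i]]
print(f"{len(swapped)}/{len(divergent)} divergent samples follow cache state")

On the run described above, all 43 divergent samples satisfy the swap condition, which rules out backend identity as the cause.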


scripts/staging/llm-bench/README.md

Lines changed: 682 additions & 58 deletions
Large diffs are not rendered by default.

scripts/staging/llm-bench/backends/openai_backend.py

Lines changed: 7 additions & 4 deletions
@@ -75,6 +75,7 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,
         model = config.get("model", "gpt-4.1-mini")
         max_output_tokens = int(config.get("max_output_tokens", config.get("max_tokens", 256)))
         temperature = config.get("temperature", 0.0)
+        top_p = float(config.get("top_p", 0.9))
         use_streaming = config.get("streaming", False)
         max_retries = int(config.get("max_retries", 5))
         base_sleep = float(config.get("base_sleep_s", 0.5))
@@ -87,11 +88,11 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,
             try:
                 if use_streaming:
                     result = self._generate_streaming(
-                        prompt, model, max_output_tokens, temperature
+                        prompt, model, max_output_tokens, temperature, top_p
                     )
                 else:
                     result = self._generate_non_streaming(
-                        prompt, model, max_output_tokens, temperature
+                        prompt, model, max_output_tokens, temperature, top_p
                    )

                 results.append(result)
@@ -112,13 +113,14 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,

         return results

-    def _generate_non_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float) -> Dict[str, Any]:
+    def _generate_non_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
         t0 = time.perf_counter()
         resp = self.client.responses.create(
             model=model,
             input=prompt,
             max_output_tokens=max_output_tokens,
             temperature=temperature,
+            top_p=top_p,
         )
         t1 = time.perf_counter()
@@ -141,13 +143,14 @@ def _generate_non_streaming(self, prompt: str, model: str, max_output_tokens: in
             "extra": extra,
         }

-    def _generate_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float) -> Dict[str, Any]:
+    def _generate_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
         t0 = time.perf_counter()
         stream = self.client.responses.create(
             model=model,
             input=prompt,
             max_output_tokens=max_output_tokens,
             temperature=temperature,
+            top_p=top_p,
             stream=True,
         )
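
Both request paths now receive the same resolved top_p, so streaming and non-streaming runs sample from an identical nucleus. A standalone sketch of the fallback chain this hunk relies on (the helper name is illustrative, not part of the backend):

from typing import Any, Dict

def resolve_sampling(config: Dict[str, Any]) -> Dict[str, Any]:
    # Mirrors the fallback chain in generate(): max_output_tokens wins,
    # then the legacy max_tokens key, then the default of 256.
    return {
        "max_output_tokens": int(config.get("max_output_tokens",
                                            config.get("max_tokens", 256))),
        "temperature": config.get("temperature", 0.0),
        "top_p": float(config.get("top_p", 0.9)),  # new in this commit
    }

assert resolve_sampling({"max_tokens": 128})["max_output_tokens"] == 128
assert resolve_sampling({})["top_p"] == 0.9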

scripts/staging/llm-bench/backends/systemds_backend.py

Lines changed: 4 additions & 3 deletions
@@ -38,7 +38,7 @@
 # DML script that uses the native llmPredict built-in
 _DML_SCRIPT = (
     'prompts = read("prompts", data_type="frame")\n'
-    'results = llmPredict(target=prompts, url=$url, max_tokens=$mt,'
+    'results = llmPredict(target=prompts, url=$url, model=$model, max_tokens=$mt,'
     ' temperature=$temp, top_p=$tp, concurrency=$conc)\n'
     'write(results, "results")'
 )
@@ -105,13 +105,14 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,

         t_pipeline_start = time.perf_counter()

-        script_key = (self.inference_url, max_tokens, temperature, top_p, concurrency)
+        script_key = (self.inference_url, self.model, max_tokens, temperature, top_p, concurrency)
         if script_key in self._script_cache:
             ps = self._script_cache[script_key]
             logger.debug("Reusing cached PreparedScript for key %s", script_key)
         else:
             args = self._gateway.jvm.java.util.HashMap()
             args.put("$url", self.inference_url)
+            args.put("$model", self.model)
             args.put("$mt", str(max_tokens))
             args.put("$temp", str(temperature))
             args.put("$tp", str(top_p))
@@ -147,7 +148,7 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,
             if "java.net.SocketTimeoutException" in err_msg:
                 raise RuntimeError(
                     "Inference server timed out. The server may be overloaded "
-                    "or the read timeout (120 s) was exceeded."
+                    "or the read timeout (300 s) was exceeded."
                 ) from e
             raise RuntimeError(
                 f"SystemDS executeScript failed: {err_msg}"
scripts/staging/llm-bench/backends/vllm_backend.py

Lines changed: 13 additions & 44 deletions
@@ -21,7 +21,6 @@

 """vLLM backend -- connects to a running vLLM OpenAI-compatible server."""

-import json
 import logging
 import os
 import time
@@ -53,63 +52,38 @@ def __init__(self, model: str, base_url: str = None):
     def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str, Any]]:
         max_tokens = int(config.get("max_tokens", config.get("max_output_tokens", 512)))
         temperature = float(config.get("temperature", 0.0))
+        top_p = float(config.get("top_p", 0.9))
         results = []
         for prompt in prompts:
             try:
-                results.append(self._generate_single(prompt, max_tokens, temperature))
+                results.append(self._generate_single(prompt, max_tokens, temperature, top_p))
             except Exception as e:
                 logger.error("vLLM generation failed: %s", e)
                 results.append({"text": "", "latency_ms": 0.0, "extra": {"error": repr(e)}})
         return results

-    def _generate_single(self, prompt: str, max_tokens: int, temperature: float) -> Dict[str, Any]:
+    def _generate_single(self, prompt: str, max_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
         payload = {
             "model": self.model,
             "prompt": prompt,
             "max_tokens": max_tokens,
             "temperature": temperature,
-            "stream": True,
+            "top_p": top_p,
+            "stream": False,
         }

         t0 = time.perf_counter()
-        t_first = None
-        chunks = []
-        usage_data = None
-
-        with requests.post(
+        resp = requests.post(
             f"{self.base_url}/v1/completions",
             json=payload,
             headers={"Content-Type": "application/json"},
-            stream=True,
-            timeout=300,
-        ) as resp:
-            resp.raise_for_status()
-            for line in resp.iter_lines():
-                if not line:
-                    continue
-                line = line.decode("utf-8")
-                if not line.startswith("data: "):
-                    continue
-                data_str = line[6:]
-                if data_str == "[DONE]":
-                    break
-                try:
-                    chunk = json.loads(data_str)
-                except json.JSONDecodeError:
-                    continue
-
-                choices = chunk.get("choices", [])
-                if choices and t_first is None and choices[0].get("text"):
-                    t_first = time.perf_counter()
-                for choice in choices:
-                    t = choice.get("text", "")
-                    if t:
-                        chunks.append(t)
-                if "usage" in chunk:
-                    usage_data = chunk["usage"]
-
+            timeout=(10, 300),
+        )
         t1 = time.perf_counter()
-        text = "".join(chunks)
+        resp.raise_for_status()
+
+        body = resp.json()
+        text = body["choices"][0]["text"]
         total_ms = (t1 - t0) * 1000.0

         result: Dict[str, Any] = {
@@ -118,12 +92,7 @@ def _generate_single(self, prompt: str, max_tokens: int, temperature: float) ->
             "extra": {},
         }

-        # only report TTFT if we actually measured first-token arrival
-        if t_first is not None:
-            result["ttft_ms"] = (t_first - t0) * 1000.0
-            result["generation_ms"] = (t1 - t_first) * 1000.0
-
-        # only report token counts if the server returned them
+        usage_data = body.get("usage")
         if usage_data:
             result["extra"]["usage"] = {
                 "input_tokens": usage_data.get("prompt_tokens", 0),
Lines changed: 16 additions & 16 deletions
@@ -1,16 +1,16 @@
-run_dir,ts,backend,backend_model,workload,n,accuracy_mean,accuracy_count,api_cost_usd,cost_per_1m_tokens,electricity_cost_usd,hardware_amortization_usd,total_compute_cost_usd,memory_mb_peak,cpu_percent_avg,latency_ms_mean,latency_ms_std,latency_ms_min,latency_ms_max,latency_ms_p50,latency_ms_p95,latency_ms_cv,throughput_req_per_s,total_tokens,avg_tokens,total_input_tokens,total_output_tokens,ttft_ms_mean,generation_ms_mean,concurrency,rouge1_f,rouge2_f,rougeL_f
-vllm_qwen3b_math,2026-02-25T03:44:40.368334+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,math,50.0,0.6800,34/50,0.000000,0.0000,0.002787,0.053089,0.055876,149.0,2.8,1911.2006275495514,541.36,729.41,2549.99,1920.4873470589519,2544.023859174922,0.2833,0.5231831373353106,,,,,29.76,1881.44,,,,
-vllm_qwen3b_reasoning,2026-02-25T03:45:36.053948+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,reasoning,50.0,0.6400,32/50,0.000000,0.0000,0.001531,0.029167,0.030698,148.0,2.5,1050.0086006894708,280.14,501.65,1639.09,1048.1926704524085,1565.7067460240796,0.2668,0.9522370949510227,,,,,26.41,1023.60,,,,
-vllm_qwen3b_summarization,2026-02-25T03:45:57.279603+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,summarization,50.0,0.6200,31/50,0.000000,0.0000,0.000521,0.009930,0.010451,148.0,2.6,357.48520554974675,140.51,157.27,660.93,334.4448370626196,653.8579267216846,0.3931,2.7960121385357626,,,,,32.33,325.16,,0.2256,0.0561,0.1573
-vllm_qwen3b_json_extraction,2026-02-25T03:46:24.926495+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,json_extraction,50.0,0.5200,26/50,0.000000,0.0000,0.000756,0.014405,0.015161,146.0,2.1,518.5651833564043,167.29,297.37,972.98,457.4713610345498,782.0741413393989,0.3226,1.9278017596520927,,,,,30.67,487.89,,,,
-vllm_qwen3b_embeddings,2026-02-25T03:46:30.729177+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,embeddings,50.0,0.9000,45/50,0.000000,0.0000,0.000069,0.001323,0.001393,152.9,7.6,47.637368431314826,3.35,42.45,53.91,46.51483451016247,52.689887944143265,0.0703,20.93053802290314,,,,,29.27,18.36,,,,
-systemds_qwen3b_math,2026-02-25T16:43:43.522303+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,math,50.0,0.6800,34/50,0.000000,2.4199,0.002806,0.053444,0.056250,158.0,0.2,1924.0,543.79,746.00,2568.00,1923.0,2564.0,0.2826,0.5167099295194738,23245,464.9000,4016,19229,,,,,,
-systemds_qwen3b_reasoning,2026-02-25T16:44:43.564082+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,reasoning,50.0,0.6000,30/50,0.000000,1.5940,0.001610,0.030667,0.032277,158.0,0.2,1104.02,325.27,542.00,2101.00,1097.0,1641.8999999999999,0.2946,0.898948616822443,20249,404.9800,9337,10912,,,,,,
-systemds_qwen3b_summarization,2026-02-25T16:45:06.721049+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,summarization,50.0,0.5000,25/50,0.000000,0.6825,0.000535,0.010181,0.010716,160.0,0.5,366.52,145.11,162.00,661.00,354.0,637.1499999999999,0.3959,2.6577487014516517,15701,314.0200,12242,3459,,,,0.2198,0.0566,0.1566
-systemds_qwen3b_json_extraction,2026-02-25T16:45:36.426217+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,json_extraction,50.0,0.5200,26/50,0.000000,1.4096,0.000771,0.014680,0.015451,157.0,0.5,528.48,169.30,306.00,983.00,465.5,797.85,0.3203,1.8517758760221859,10961,219.2200,5919,5042,,,,,,
-systemds_qwen3b_embeddings,2026-02-25T16:45:52.947905+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,embeddings,50.0,0.9000,45/50,0.000000,0.3540,0.000068,0.001291,0.001359,158.0,2.3,46.48,7.70,40.00,93.00,44.0,57.099999999999994,0.1657,18.04542374908207,3839,76.7800,3589,250,,,,,,
-openai_math,2026-02-27T20:14:30.513077+00:00,openai,gpt-4.1-mini,math,50.0,0.9600,48/50,0.022309,1.3070,0.000000,0.000000,0.000000,164.6,1.6,4576.56230248,1248.05,2117.60,8749.62,4322.632333500011,6435.652603799997,0.2727,0.2184699489817528,17069,341.3800,4168,12901,778.75,3797.81,,,,
-openai_reasoning,2026-02-27T20:16:01.652306+00:00,openai,gpt-4.1-mini,reasoning,50.0,0.8800,44/50,0.009984,0.7771,0.000000,0.000000,0.000000,166.0,3.1,1735.1548483399995,619.02,904.09,4881.41,1688.0436875000023,2549.1829626499934,0.3568,0.5760060829950922,12848,256.9600,8811,4037,550.27,1184.88,,,,
-openai_summarization,2026-02-27T20:17:04.370727+00:00,openai,gpt-4.1-mini,summarization,50.0,0.8600,43/50,0.007539,0.5451,0.000000,0.000000,0.000000,176.8,3.8,1130.7603850000003,384.29,722.89,3368.68,1058.428687499999,1499.2941524499975,0.3399,0.8837403133927044,13832,276.6400,12160,1672,632.09,498.67,,0.2697,0.0660,0.2009
-openai_json_extraction,2026-02-27T20:18:33.535271+00:00,openai,gpt-4.1-mini,json_extraction,46.0,0.6087,28/46,0.005592,0.8311,0.000000,0.000000,0.000000,176.7,2.9,1497.5694981521735,842.46,789.29,5559.85,1283.2824784999984,2850.8575522500028,0.5625,0.6672573098693623,6728,146.2609,4311,2417,638.97,858.60,,,,
-openai_embeddings,2026-02-27T20:19:16.367666+00:00,openai,gpt-4.1-mini,embeddings,50.0,0.8800,44/50,0.001894,0.4580,0.000000,0.000000,0.000000,189.1,1.5,773.4331351200003,274.06,508.85,1767.04,688.1857915000014,1307.1644873500004,0.3543,1.2907764523552046,4135,82.7000,3935,200,632.07,141.36,,,,
+directory,backend,model,workload,n,accuracy,accuracy_mean,latency_ms_mean,throughput_rps,total_compute_cost_usd
+openai_embeddings,openai,gpt-4.1-mini,embeddings,50,44/50,0.88,773.4,1.29,0.001894
+openai_json_extraction,openai,gpt-4.1-mini,json_extraction,46,28/46,0.6086956521739131,1497.6,0.67,0.005592
+openai_math,openai,gpt-4.1-mini,math,50,48/50,0.96,4576.6,0.22,0.022309
+openai_reasoning,openai,gpt-4.1-mini,reasoning,50,44/50,0.88,1735.2,0.58,0.009984
+openai_summarization,openai,gpt-4.1-mini,summarization,50,43/50,0.86,1130.8,0.88,0.007539
+systemds_qwen3b_embeddings,systemds,Qwen/Qwen2.5-3B-Instruct,embeddings,50,45/50,0.9,46.5,18.05,0.00162
+systemds_qwen3b_json_extraction,systemds,Qwen/Qwen2.5-3B-Instruct,json_extraction,50,26/50,0.52,528.5,1.85,0.015788
+systemds_qwen3b_math,systemds,Qwen/Qwen2.5-3B-Instruct,math,50,34/50,0.68,1924.0,0.52,0.056581
+systemds_qwen3b_reasoning,systemds,Qwen/Qwen2.5-3B-Instruct,reasoning,50,31/50,0.62,1104.0,0.9,0.032523
+systemds_qwen3b_summarization,systemds,Qwen/Qwen2.5-3B-Instruct,summarization,50,25/50,0.5,366.5,2.66,0.011
+vllm_qwen3b_embeddings,vllm,Qwen/Qwen2.5-3B-Instruct,embeddings,50,45/50,0.9,47.6,20.93,0.001397
+vllm_qwen3b_json_extraction,vllm,Qwen/Qwen2.5-3B-Instruct,json_extraction,50,26/50,0.52,518.6,1.93,0.015166
+vllm_qwen3b_math,vllm,Qwen/Qwen2.5-3B-Instruct,math,50,34/50,0.68,1911.2,0.52,0.055881
+vllm_qwen3b_reasoning,vllm,Qwen/Qwen2.5-3B-Instruct,reasoning,50,33/50,0.66,1050.0,0.95,0.030703
+vllm_qwen3b_summarization,vllm,Qwen/Qwen2.5-3B-Instruct,summarization,50,31/50,0.62,357.5,2.8,0.010456
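
The reduced schema makes backend comparisons a one-liner; a short pandas sketch over the header shown above (the CSV path is an assumption):

import pandas as pd

df = pd.read_csv("results/summary.csv")
# Pivot accuracy by workload x backend, matching the rows above.
pivot = df.pivot_table(index="workload", columns="backend", values="accuracy_mean")
print(pivot.round(2))  # e.g. math: openai 0.96, systemds 0.68, vllm 0.68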

scripts/staging/llm-bench/results/systemds_qwen3b_embeddings/manifest.json

Lines changed: 7 additions & 7 deletions
@@ -1,6 +1,6 @@
 {
   "git_commit_hash": "5d47925ed6a7953871e90ecc5f27c0a0e7f3ac6a",
-  "timestamp_utc": "2026-02-25T16:45:52.947905+00:00",
+  "timestamp_utc": "2026-03-02T06:30:27.949385+00:00",
   "python_version": "3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0]",
   "platform": {
     "os": "Linux",
@@ -17,26 +17,26 @@
       "index": 0,
       "name": "NVIDIA H100 PCIe",
       "memory_total_mb": 81559.0,
-      "memory_used_mb": 17830.625,
-      "memory_free_mb": 63728.375,
+      "memory_used_mb": 30990.6875,
+      "memory_free_mb": 50568.3125,
       "gpu_utilization_pct": 0,
       "memory_utilization_pct": 0
     },
     {
       "index": 1,
       "name": "NVIDIA H100 PCIe",
       "memory_total_mb": 81559.0,
-      "memory_used_mb": 81372.1875,
-      "memory_free_mb": 186.8125,
+      "memory_used_mb": 74773.125,
+      "memory_free_mb": 6785.875,
       "gpu_utilization_pct": 0,
       "memory_utilization_pct": 0
     },
     {
       "index": 2,
       "name": "NVIDIA H100 PCIe",
       "memory_total_mb": 81559.0,
-      "memory_used_mb": 483.0625,
-      "memory_free_mb": 81075.9375,
+      "memory_used_mb": 485.5625,
+      "memory_free_mb": 81073.4375,
       "gpu_utilization_pct": 0,
       "memory_utilization_pct": 0
     }
Lines changed: 29 additions & 29 deletions
@@ -1,49 +1,53 @@
 {
   "n": 50.0,
-  "latency_ms_mean": 46.48,
-  "latency_ms_std": 7.699974025930217,
-  "latency_ms_min": 40.0,
-  "latency_ms_max": 93.0,
-  "latency_ms_p50": 44.0,
-  "latency_ms_p95": 57.099999999999994,
-  "latency_ms_cv": 0.16566209177991,
-  "throughput_req_per_s": 18.04542374908207,
+  "latency_ms_mean": 41.1,
+  "latency_ms_std": 6.014149981501958,
+  "latency_ms_min": 38.0,
+  "latency_ms_max": 80.0,
+  "latency_ms_p50": 40.0,
+  "latency_ms_p95": 45.55,
+  "latency_ms_cv": 0.14632968324822282,
+  "throughput_req_per_s": 20.013688481144992,
   "accuracy_mean": 0.9,
   "accuracy_count": "45/50",
   "total_input_tokens": 3589,
   "total_output_tokens": 250,
   "total_tokens": 3839,
-  "memory_mb_initial": 139.0,
+  "electricity_kwh": 0.00024288931626425536,
+  "electricity_cost_usd": 7.286679487927661e-05,
+  "hardware_amortization_usd": 0.0013879389500814593,
+  "total_compute_cost_usd": 0.0014608057449607358,
+  "memory_mb_initial": 138.0,
   "memory_mb_peak": 158.0,
-  "memory_mb_avg": 154.83333333333334,
-  "cpu_percent_avg": 2.3333333333333335,
+  "memory_mb_avg": 154.66666666666666,
+  "cpu_percent_avg": 2.6666666666666665,
   "gpu_info": {
     "gpu_count": 3,
     "gpus": [
       {
         "index": 0,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 17830.625,
-        "memory_free_mb": 63728.375,
+        "memory_used_mb": 30990.6875,
+        "memory_free_mb": 50568.3125,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 1,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 81372.1875,
-        "memory_free_mb": 186.8125,
+        "memory_used_mb": 74773.125,
+        "memory_free_mb": 6785.875,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 2,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 483.0625,
-        "memory_free_mb": 81075.9375,
+        "memory_used_mb": 485.5625,
+        "memory_free_mb": 81073.4375,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       }
@@ -56,33 +60,29 @@
         "index": 0,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 17830.625,
-        "memory_free_mb": 63728.375,
+        "memory_used_mb": 30990.6875,
+        "memory_free_mb": 50568.3125,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 1,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 81372.1875,
-        "memory_free_mb": 186.8125,
+        "memory_used_mb": 74773.125,
+        "memory_free_mb": 6785.875,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 2,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 483.0625,
-        "memory_free_mb": 81075.9375,
+        "memory_used_mb": 485.5625,
+        "memory_free_mb": 81073.4375,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       }
     ]
-  },
-  "electricity_kwh": 0.00022594444444444439,
-  "electricity_cost_usd": 6.778333333333332e-05,
-  "hardware_amortization_usd": 0.001291111111111111,
-  "total_compute_cost_usd": 0.0013588944444444442
-}
+  }
+}
