
Commit 2b427fb

Add CUBLAS determinism experiment, APC root cause analysis, and updated results
- Switch vllm_backend.py to stream=False to match SystemDS
- Update results/ with the CUBLAS deterministic run (vLLM + SystemDS, H100, Mar 2)
- README: add CUBLAS experiment results table (207/250 = 82.8% identical)
- README: document vLLM Automatic Prefix Caching (APC) as the root cause of the remaining 43 divergent samples, proven by an order-reversal experiment (43/43 swap, 0 exceptions)
- README: add server-log evidence showing the prefix cache hit rate rising from 9% to 55%
- README: add concrete swap examples, including factual hallucinations (xsum-30 athlete name, xsum-42 year, xsum-89 country) that follow cache state, not backend identity
- README: explain the APC mechanism: a cold cache runs the full prefill kernel, while a warm cache skips prefill and loads stored KV tensors through a different code path; outputs are deterministic given a fixed cache state, temperature=0, deterministic cuBLAS, and sequential requests
- Fix LlmPredictCPInstruction, JMLCLLMInferenceTest, workload configs, and test files
1 parent bf72b49 commit 2b427fb

45 files changed

Lines changed: 2046 additions & 939 deletions
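
The order-reversal experiment named in the commit message can be reproduced with a short comparison script. A minimal sketch, assuming each run saves its per-sample outputs as a JSON list of {"id", "text"} records (file names and layout are illustrative, not part of this commit):

import json

def load(path):
    # Assumed layout: a JSON list of {"id": ..., "text": ...} records per run.
    with open(path) as f:
        return {r["id"]: r["text"] for r in json.load(f)}

# Forward order: the vLLM client runs first (cold cache), SystemDS second (warm).
vllm_fwd = load("results/vllm_forward.json")
sysds_fwd = load("results/systemds_forward.json")
# Reversed order: the SystemDS client runs first, vLLM second.
vllm_rev = load("results/vllm_reversed.json")
sysds_rev = load("results/systemds_reversed.json")

divergent = [i for i in vllm_fwd if vllm_fwd[i] != sysds_fwd[i]]
# If APC cache state (cold vs. warm prefix cache) drives the divergence,
# each divergent pair should swap roles when the request order is reversed.
swapped = [i for i in divergent
           if vllm_rev[i] == sysds_fwd[i] and sysds_rev[i] == vllm_fwd[i]]
print(f"{len(swapped)}/{len(divergent)} divergent samples follow cache state")

On the run described above, all 43 divergent samples satisfy the swap condition, which rules out backend identity as the cause.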


scripts/staging/llm-bench/README.md

Lines changed: 682 additions & 58 deletions
Large diffs are not rendered by default.

scripts/staging/llm-bench/backends/openai_backend.py

Lines changed: 7 additions & 4 deletions
@@ -75,6 +75,7 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,
         model = config.get("model", "gpt-4.1-mini")
         max_output_tokens = int(config.get("max_output_tokens", config.get("max_tokens", 256)))
         temperature = config.get("temperature", 0.0)
+        top_p = float(config.get("top_p", 0.9))
         use_streaming = config.get("streaming", False)
         max_retries = int(config.get("max_retries", 5))
         base_sleep = float(config.get("base_sleep_s", 0.5))
@@ -87,11 +88,11 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,
             try:
                 if use_streaming:
                     result = self._generate_streaming(
-                        prompt, model, max_output_tokens, temperature
+                        prompt, model, max_output_tokens, temperature, top_p
                     )
                 else:
                     result = self._generate_non_streaming(
-                        prompt, model, max_output_tokens, temperature
+                        prompt, model, max_output_tokens, temperature, top_p
                    )

                 results.append(result)
@@ -112,13 +113,14 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,

         return results

-    def _generate_non_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float) -> Dict[str, Any]:
+    def _generate_non_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
         t0 = time.perf_counter()
         resp = self.client.responses.create(
             model=model,
             input=prompt,
             max_output_tokens=max_output_tokens,
             temperature=temperature,
+            top_p=top_p,
         )
         t1 = time.perf_counter()
@@ -141,13 +143,14 @@ def _generate_non_streaming(self, prompt: str, model: str, max_output_tokens: in
             "extra": extra,
         }

-    def _generate_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float) -> Dict[str, Any]:
+    def _generate_streaming(self, prompt: str, model: str, max_output_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
         t0 = time.perf_counter()
         stream = self.client.responses.create(
             model=model,
             input=prompt,
             max_output_tokens=max_output_tokens,
             temperature=temperature,
+            top_p=top_p,
             stream=True,
         )
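
Both request paths now receive the same resolved top_p, so streaming and non-streaming runs sample from an identical nucleus. A standalone sketch of the fallback chain this hunk relies on (the helper name is illustrative, not part of the backend):

from typing import Any, Dict

def resolve_sampling(config: Dict[str, Any]) -> Dict[str, Any]:
    # Mirrors the fallback chain in generate(): max_output_tokens wins,
    # then the legacy max_tokens key, then the default of 256.
    return {
        "max_output_tokens": int(config.get("max_output_tokens",
                                            config.get("max_tokens", 256))),
        "temperature": config.get("temperature", 0.0),
        "top_p": float(config.get("top_p", 0.9)),  # new in this commit
    }

assert resolve_sampling({"max_tokens": 128})["max_output_tokens"] == 128
assert resolve_sampling({})["top_p"] == 0.9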

scripts/staging/llm-bench/backends/systemds_backend.py

Lines changed: 4 additions & 3 deletions
@@ -38,7 +38,7 @@
 # DML script that uses the native llmPredict built-in
 _DML_SCRIPT = (
     'prompts = read("prompts", data_type="frame")\n'
-    'results = llmPredict(target=prompts, url=$url, max_tokens=$mt,'
+    'results = llmPredict(target=prompts, url=$url, model=$model, max_tokens=$mt,'
     ' temperature=$temp, top_p=$tp, concurrency=$conc)\n'
     'write(results, "results")'
 )
@@ -105,13 +105,14 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,

         t_pipeline_start = time.perf_counter()

-        script_key = (self.inference_url, max_tokens, temperature, top_p, concurrency)
+        script_key = (self.inference_url, self.model, max_tokens, temperature, top_p, concurrency)
         if script_key in self._script_cache:
             ps = self._script_cache[script_key]
             logger.debug("Reusing cached PreparedScript for key %s", script_key)
         else:
             args = self._gateway.jvm.java.util.HashMap()
             args.put("$url", self.inference_url)
+            args.put("$model", self.model)
             args.put("$mt", str(max_tokens))
             args.put("$temp", str(temperature))
             args.put("$tp", str(top_p))
@@ -147,7 +148,7 @@ def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str,
             if "java.net.SocketTimeoutException" in err_msg:
                 raise RuntimeError(
                     "Inference server timed out. The server may be overloaded "
-                    "or the read timeout (120 s) was exceeded."
+                    "or the read timeout (300 s) was exceeded."
                 ) from e
             raise RuntimeError(
                 f"SystemDS executeScript failed: {err_msg}"
scripts/staging/llm-bench/backends/vllm_backend.py

Lines changed: 13 additions & 44 deletions
@@ -21,7 +21,6 @@

 """vLLM backend -- connects to a running vLLM OpenAI-compatible server."""

-import json
 import logging
 import os
 import time
@@ -53,63 +52,38 @@ def __init__(self, model: str, base_url: str = None):
     def generate(self, prompts: List[str], config: Dict[str, Any]) -> List[Dict[str, Any]]:
         max_tokens = int(config.get("max_tokens", config.get("max_output_tokens", 512)))
         temperature = float(config.get("temperature", 0.0))
+        top_p = float(config.get("top_p", 0.9))
         results = []
         for prompt in prompts:
             try:
-                results.append(self._generate_single(prompt, max_tokens, temperature))
+                results.append(self._generate_single(prompt, max_tokens, temperature, top_p))
             except Exception as e:
                 logger.error("vLLM generation failed: %s", e)
                 results.append({"text": "", "latency_ms": 0.0, "extra": {"error": repr(e)}})
         return results

-    def _generate_single(self, prompt: str, max_tokens: int, temperature: float) -> Dict[str, Any]:
+    def _generate_single(self, prompt: str, max_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
         payload = {
             "model": self.model,
             "prompt": prompt,
             "max_tokens": max_tokens,
             "temperature": temperature,
-            "stream": True,
+            "top_p": top_p,
+            "stream": False,
         }

         t0 = time.perf_counter()
-        t_first = None
-        chunks = []
-        usage_data = None
-
-        with requests.post(
+        resp = requests.post(
             f"{self.base_url}/v1/completions",
             json=payload,
             headers={"Content-Type": "application/json"},
-            stream=True,
-            timeout=300,
-        ) as resp:
-            resp.raise_for_status()
-            for line in resp.iter_lines():
-                if not line:
-                    continue
-                line = line.decode("utf-8")
-                if not line.startswith("data: "):
-                    continue
-                data_str = line[6:]
-                if data_str == "[DONE]":
-                    break
-                try:
-                    chunk = json.loads(data_str)
-                except json.JSONDecodeError:
-                    continue
-
-                choices = chunk.get("choices", [])
-                if choices and t_first is None and choices[0].get("text"):
-                    t_first = time.perf_counter()
-                for choice in choices:
-                    t = choice.get("text", "")
-                    if t:
-                        chunks.append(t)
-                if "usage" in chunk:
-                    usage_data = chunk["usage"]
-
+            timeout=(10, 300),
+        )
         t1 = time.perf_counter()
-        text = "".join(chunks)
+        resp.raise_for_status()
+
+        body = resp.json()
+        text = body["choices"][0]["text"]
         total_ms = (t1 - t0) * 1000.0

         result: Dict[str, Any] = {
@@ -118,12 +92,7 @@ def _generate_single(self, prompt: str, max_tokens: int, temperature: float) ->
             "extra": {},
         }

-        # only report TTFT if we actually measured first-token arrival
-        if t_first is not None:
-            result["ttft_ms"] = (t_first - t0) * 1000.0
-            result["generation_ms"] = (t1 - t_first) * 1000.0
-
-        # only report token counts if the server returned them
+        usage_data = body.get("usage")
         if usage_data:
             result["extra"]["usage"] = {
                 "input_tokens": usage_data.get("prompt_tokens", 0),
Lines changed: 16 additions & 16 deletions
@@ -1,16 +1,16 @@
-run_dir,ts,backend,backend_model,workload,n,accuracy_mean,accuracy_count,api_cost_usd,cost_per_1m_tokens,electricity_cost_usd,hardware_amortization_usd,total_compute_cost_usd,memory_mb_peak,cpu_percent_avg,latency_ms_mean,latency_ms_std,latency_ms_min,latency_ms_max,latency_ms_p50,latency_ms_p95,latency_ms_cv,throughput_req_per_s,total_tokens,avg_tokens,total_input_tokens,total_output_tokens,ttft_ms_mean,generation_ms_mean,concurrency,rouge1_f,rouge2_f,rougeL_f
-vllm_qwen3b_math,2026-02-25T03:44:40.368334+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,math,50.0,0.6800,34/50,0.000000,0.0000,0.002787,0.053089,0.055876,149.0,2.8,1911.2006275495514,541.36,729.41,2549.99,1920.4873470589519,2544.023859174922,0.2833,0.5231831373353106,,,,,29.76,1881.44,,,,
-vllm_qwen3b_reasoning,2026-02-25T03:45:36.053948+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,reasoning,50.0,0.6400,32/50,0.000000,0.0000,0.001531,0.029167,0.030698,148.0,2.5,1050.0086006894708,280.14,501.65,1639.09,1048.1926704524085,1565.7067460240796,0.2668,0.9522370949510227,,,,,26.41,1023.60,,,,
-vllm_qwen3b_summarization,2026-02-25T03:45:57.279603+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,summarization,50.0,0.6200,31/50,0.000000,0.0000,0.000521,0.009930,0.010451,148.0,2.6,357.48520554974675,140.51,157.27,660.93,334.4448370626196,653.8579267216846,0.3931,2.7960121385357626,,,,,32.33,325.16,,0.2256,0.0561,0.1573
-vllm_qwen3b_json_extraction,2026-02-25T03:46:24.926495+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,json_extraction,50.0,0.5200,26/50,0.000000,0.0000,0.000756,0.014405,0.015161,146.0,2.1,518.5651833564043,167.29,297.37,972.98,457.4713610345498,782.0741413393989,0.3226,1.9278017596520927,,,,,30.67,487.89,,,,
-vllm_qwen3b_embeddings,2026-02-25T03:46:30.729177+00:00,vllm,Qwen/Qwen2.5-3B-Instruct,embeddings,50.0,0.9000,45/50,0.000000,0.0000,0.000069,0.001323,0.001393,152.9,7.6,47.637368431314826,3.35,42.45,53.91,46.51483451016247,52.689887944143265,0.0703,20.93053802290314,,,,,29.27,18.36,,,,
-systemds_qwen3b_math,2026-02-25T16:43:43.522303+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,math,50.0,0.6800,34/50,0.000000,2.4199,0.002806,0.053444,0.056250,158.0,0.2,1924.0,543.79,746.00,2568.00,1923.0,2564.0,0.2826,0.5167099295194738,23245,464.9000,4016,19229,,,,,,
-systemds_qwen3b_reasoning,2026-02-25T16:44:43.564082+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,reasoning,50.0,0.6000,30/50,0.000000,1.5940,0.001610,0.030667,0.032277,158.0,0.2,1104.02,325.27,542.00,2101.00,1097.0,1641.8999999999999,0.2946,0.898948616822443,20249,404.9800,9337,10912,,,,,,
-systemds_qwen3b_summarization,2026-02-25T16:45:06.721049+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,summarization,50.0,0.5000,25/50,0.000000,0.6825,0.000535,0.010181,0.010716,160.0,0.5,366.52,145.11,162.00,661.00,354.0,637.1499999999999,0.3959,2.6577487014516517,15701,314.0200,12242,3459,,,,0.2198,0.0566,0.1566
-systemds_qwen3b_json_extraction,2026-02-25T16:45:36.426217+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,json_extraction,50.0,0.5200,26/50,0.000000,1.4096,0.000771,0.014680,0.015451,157.0,0.5,528.48,169.30,306.00,983.00,465.5,797.85,0.3203,1.8517758760221859,10961,219.2200,5919,5042,,,,,,
-systemds_qwen3b_embeddings,2026-02-25T16:45:52.947905+00:00,systemds,Qwen/Qwen2.5-3B-Instruct,embeddings,50.0,0.9000,45/50,0.000000,0.3540,0.000068,0.001291,0.001359,158.0,2.3,46.48,7.70,40.00,93.00,44.0,57.099999999999994,0.1657,18.04542374908207,3839,76.7800,3589,250,,,,,,
-openai_math,2026-02-27T20:14:30.513077+00:00,openai,gpt-4.1-mini,math,50.0,0.9600,48/50,0.022309,1.3070,0.000000,0.000000,0.000000,164.6,1.6,4576.56230248,1248.05,2117.60,8749.62,4322.632333500011,6435.652603799997,0.2727,0.2184699489817528,17069,341.3800,4168,12901,778.75,3797.81,,,,
-openai_reasoning,2026-02-27T20:16:01.652306+00:00,openai,gpt-4.1-mini,reasoning,50.0,0.8800,44/50,0.009984,0.7771,0.000000,0.000000,0.000000,166.0,3.1,1735.1548483399995,619.02,904.09,4881.41,1688.0436875000023,2549.1829626499934,0.3568,0.5760060829950922,12848,256.9600,8811,4037,550.27,1184.88,,,,
-openai_summarization,2026-02-27T20:17:04.370727+00:00,openai,gpt-4.1-mini,summarization,50.0,0.8600,43/50,0.007539,0.5451,0.000000,0.000000,0.000000,176.8,3.8,1130.7603850000003,384.29,722.89,3368.68,1058.428687499999,1499.2941524499975,0.3399,0.8837403133927044,13832,276.6400,12160,1672,632.09,498.67,,0.2697,0.0660,0.2009
-openai_json_extraction,2026-02-27T20:18:33.535271+00:00,openai,gpt-4.1-mini,json_extraction,46.0,0.6087,28/46,0.005592,0.8311,0.000000,0.000000,0.000000,176.7,2.9,1497.5694981521735,842.46,789.29,5559.85,1283.2824784999984,2850.8575522500028,0.5625,0.6672573098693623,6728,146.2609,4311,2417,638.97,858.60,,,,
-openai_embeddings,2026-02-27T20:19:16.367666+00:00,openai,gpt-4.1-mini,embeddings,50.0,0.8800,44/50,0.001894,0.4580,0.000000,0.000000,0.000000,189.1,1.5,773.4331351200003,274.06,508.85,1767.04,688.1857915000014,1307.1644873500004,0.3543,1.2907764523552046,4135,82.7000,3935,200,632.07,141.36,,,,
+directory,backend,model,workload,n,accuracy,accuracy_mean,latency_ms_mean,throughput_rps,total_compute_cost_usd
+openai_embeddings,openai,gpt-4.1-mini,embeddings,50,44/50,0.88,773.4,1.29,0.001894
+openai_json_extraction,openai,gpt-4.1-mini,json_extraction,46,28/46,0.6086956521739131,1497.6,0.67,0.005592
+openai_math,openai,gpt-4.1-mini,math,50,48/50,0.96,4576.6,0.22,0.022309
+openai_reasoning,openai,gpt-4.1-mini,reasoning,50,44/50,0.88,1735.2,0.58,0.009984
+openai_summarization,openai,gpt-4.1-mini,summarization,50,43/50,0.86,1130.8,0.88,0.007539
+systemds_qwen3b_embeddings,systemds,Qwen/Qwen2.5-3B-Instruct,embeddings,50,45/50,0.9,46.5,18.05,0.00162
+systemds_qwen3b_json_extraction,systemds,Qwen/Qwen2.5-3B-Instruct,json_extraction,50,26/50,0.52,528.5,1.85,0.015788
+systemds_qwen3b_math,systemds,Qwen/Qwen2.5-3B-Instruct,math,50,34/50,0.68,1924.0,0.52,0.056581
+systemds_qwen3b_reasoning,systemds,Qwen/Qwen2.5-3B-Instruct,reasoning,50,31/50,0.62,1104.0,0.9,0.032523
+systemds_qwen3b_summarization,systemds,Qwen/Qwen2.5-3B-Instruct,summarization,50,25/50,0.5,366.5,2.66,0.011
+vllm_qwen3b_embeddings,vllm,Qwen/Qwen2.5-3B-Instruct,embeddings,50,45/50,0.9,47.6,20.93,0.001397
+vllm_qwen3b_json_extraction,vllm,Qwen/Qwen2.5-3B-Instruct,json_extraction,50,26/50,0.52,518.6,1.93,0.015166
+vllm_qwen3b_math,vllm,Qwen/Qwen2.5-3B-Instruct,math,50,34/50,0.68,1911.2,0.52,0.055881
+vllm_qwen3b_reasoning,vllm,Qwen/Qwen2.5-3B-Instruct,reasoning,50,33/50,0.66,1050.0,0.95,0.030703
+vllm_qwen3b_summarization,vllm,Qwen/Qwen2.5-3B-Instruct,summarization,50,31/50,0.62,357.5,2.8,0.010456
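
The reduced schema makes backend comparisons a one-liner; a short pandas sketch over the header shown above (the CSV path is an assumption):

import pandas as pd

df = pd.read_csv("results/summary.csv")
# Pivot accuracy by workload x backend, matching the rows above.
pivot = df.pivot_table(index="workload", columns="backend", values="accuracy_mean")
print(pivot.round(2))  # e.g. math: openai 0.96, systemds 0.68, vllm 0.68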

scripts/staging/llm-bench/results/systemds_qwen3b_embeddings/manifest.json

Lines changed: 7 additions & 7 deletions
@@ -1,6 +1,6 @@
 {
   "git_commit_hash": "5d47925ed6a7953871e90ecc5f27c0a0e7f3ac6a",
-  "timestamp_utc": "2026-02-25T16:45:52.947905+00:00",
+  "timestamp_utc": "2026-03-02T06:30:27.949385+00:00",
   "python_version": "3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0]",
   "platform": {
     "os": "Linux",
@@ -17,26 +17,26 @@
       "index": 0,
       "name": "NVIDIA H100 PCIe",
       "memory_total_mb": 81559.0,
-      "memory_used_mb": 17830.625,
-      "memory_free_mb": 63728.375,
+      "memory_used_mb": 30990.6875,
+      "memory_free_mb": 50568.3125,
       "gpu_utilization_pct": 0,
       "memory_utilization_pct": 0
     },
     {
       "index": 1,
       "name": "NVIDIA H100 PCIe",
       "memory_total_mb": 81559.0,
-      "memory_used_mb": 81372.1875,
-      "memory_free_mb": 186.8125,
+      "memory_used_mb": 74773.125,
+      "memory_free_mb": 6785.875,
       "gpu_utilization_pct": 0,
       "memory_utilization_pct": 0
     },
     {
       "index": 2,
       "name": "NVIDIA H100 PCIe",
       "memory_total_mb": 81559.0,
-      "memory_used_mb": 483.0625,
-      "memory_free_mb": 81075.9375,
+      "memory_used_mb": 485.5625,
+      "memory_free_mb": 81073.4375,
       "gpu_utilization_pct": 0,
       "memory_utilization_pct": 0
     }
Lines changed: 29 additions & 29 deletions
@@ -1,49 +1,53 @@
 {
   "n": 50.0,
-  "latency_ms_mean": 46.48,
-  "latency_ms_std": 7.699974025930217,
-  "latency_ms_min": 40.0,
-  "latency_ms_max": 93.0,
-  "latency_ms_p50": 44.0,
-  "latency_ms_p95": 57.099999999999994,
-  "latency_ms_cv": 0.16566209177991,
-  "throughput_req_per_s": 18.04542374908207,
+  "latency_ms_mean": 41.1,
+  "latency_ms_std": 6.014149981501958,
+  "latency_ms_min": 38.0,
+  "latency_ms_max": 80.0,
+  "latency_ms_p50": 40.0,
+  "latency_ms_p95": 45.55,
+  "latency_ms_cv": 0.14632968324822282,
+  "throughput_req_per_s": 20.013688481144992,
   "accuracy_mean": 0.9,
   "accuracy_count": "45/50",
   "total_input_tokens": 3589,
   "total_output_tokens": 250,
   "total_tokens": 3839,
-  "memory_mb_initial": 139.0,
+  "electricity_kwh": 0.00024288931626425536,
+  "electricity_cost_usd": 7.286679487927661e-05,
+  "hardware_amortization_usd": 0.0013879389500814593,
+  "total_compute_cost_usd": 0.0014608057449607358,
+  "memory_mb_initial": 138.0,
   "memory_mb_peak": 158.0,
-  "memory_mb_avg": 154.83333333333334,
-  "cpu_percent_avg": 2.3333333333333335,
+  "memory_mb_avg": 154.66666666666666,
+  "cpu_percent_avg": 2.6666666666666665,
   "gpu_info": {
     "gpu_count": 3,
     "gpus": [
       {
         "index": 0,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 17830.625,
-        "memory_free_mb": 63728.375,
+        "memory_used_mb": 30990.6875,
+        "memory_free_mb": 50568.3125,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 1,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 81372.1875,
-        "memory_free_mb": 186.8125,
+        "memory_used_mb": 74773.125,
+        "memory_free_mb": 6785.875,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 2,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 483.0625,
-        "memory_free_mb": 81075.9375,
+        "memory_used_mb": 485.5625,
+        "memory_free_mb": 81073.4375,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       }
@@ -56,33 +60,29 @@
         "index": 0,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 17830.625,
-        "memory_free_mb": 63728.375,
+        "memory_used_mb": 30990.6875,
+        "memory_free_mb": 50568.3125,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 1,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 81372.1875,
-        "memory_free_mb": 186.8125,
+        "memory_used_mb": 74773.125,
+        "memory_free_mb": 6785.875,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       },
       {
         "index": 2,
         "name": "NVIDIA H100 PCIe",
         "memory_total_mb": 81559.0,
-        "memory_used_mb": 483.0625,
-        "memory_free_mb": 81075.9375,
+        "memory_used_mb": 485.5625,
+        "memory_free_mb": 81073.4375,
         "gpu_utilization_pct": 0,
         "memory_utilization_pct": 0
       }
     ]
-  },
-  "electricity_kwh": 0.00022594444444444439,
-  "electricity_cost_usd": 6.778333333333332e-05,
-  "hardware_amortization_usd": 0.001291111111111111,
-  "total_compute_cost_usd": 0.0013588944444444442
-}
+  }
+}
