commaai · haraschax · May 20, 2026
diff --git a/extra/hevc/decode.py b/extra/hevc/decode.py
@@ -8,7 +8,7 @@
 HEVC_ROUNDUP = getenv("DATA_ROUNDUP", 32)
 
 @functools.cache
-def _hevc_jitted_decoder(out_image_size:tuple[int, int], max_hist:int, inplace:bool):
+def _hevc_jitted_decoder(out_image_size:tuple[int, int], max_hist:int, inplace:bool, opaque_len:int):
   def hevc_decode_frame(pos:Variable, hevc_tensor:Tensor, offset:Variable, sz:Variable, opaque:Tensor, i:Variable, *hist:Tensor, outbuf:Tensor|None=None):
     x = hevc_tensor[offset:offset+sz*HEVC_ROUNDUP].decode_hevc_frame(pos, out_image_size, opaque[i], hist).realize()
     if outbuf is not None: outbuf.assign(x).realize()
@@ -25,7 +25,7 @@ def hevc_decode(hevc_tensor:Tensor, opaque:Tensor, frame_info:list, luma_h:int,
   v_sz = Variable("sz", 1, ceildiv(hevc_tensor.numel(), HEVC_ROUNDUP))
   v_i = Variable("i", 0, len(frame_info)-1)
 
-  decode_jit = _hevc_jitted_decoder(out_image_size, max_hist, preallocated_outputs is not None)
+  decode_jit = _hevc_jitted_decoder(out_image_size, max_hist, preallocated_outputs is not None, opaque.shape[0])
   history = history or [Tensor.empty(*out_image_size, dtype=dtypes.uint8, device="NV").contiguous().realize() for _ in range(max_hist)]
   assert len(history) == max_hist, f"history length {len(history)} does not match max_hist {max_hist}"
 
@@ -37,6 +37,41 @@ def hevc_decode(hevc_tensor:Tensor, opaque:Tensor, frame_info:list, luma_h:int,
     if is_hist: history.append(res)
     yield res
 
+class HevcPacketDecoder:
+  """
+  Packet-oriented wrapper around `hevc_decode`.
+
+  The complete bitstream (`header` followed by every entry of `packets`) is supplied at construction and parsed once
+  with `parse_hevc_file_headers`. `Decode` only uses the length of the packet it is given: it advances a byte cursor and
+  returns the frames whose first `frame_info` field (used as the frame's start offset) lies below that cursor.
+  """
+  def __init__(self, header:bytes=b"", packets=(), device="NV"):
+    # packet_pos counts the bytes handed over so far; it starts just past the header
+    self.device, self.packet_pos, self.frame_pos = device, len(header), 0
+    self.dat = header + b"".join(packets)
+    self.opaque, self.frame_info, self.w, self.h, self.luma_w, self.luma_h, self.chroma_off = parse_hevc_file_headers(self.dat, device=device)
+    # largest 4th field over all frame_info entries (presumably the history depth, confirm in parse_hevc_file_headers)
+    self.max_hist = max((h for *_, h, _ in self.frame_info), default=0)
+    self.tensor = Tensor(self.dat, device=device)
+    # filled by the first Decode call; declared here so the lazy-init check does not have to rely on hasattr
+    self.frames:list[Tensor]|None = None
+
+  def Decode(self, packet) -> list[Tensor]:
+    """Advance the byte cursor by `len(packet)` and return the frames whose start offset is now below the cursor."""
+    self.packet_pos += len(packet)
+    start = self.frame_pos
+    while self.frame_pos < len(self.frame_info) and self.frame_info[self.frame_pos][0] < self.packet_pos: self.frame_pos += 1
+    if self.frames is None:
+      # the whole stream is decoded on first use with max_hist substituted into every entry; later calls only slice the cache
+      frame_info = [(a, b, c, self.max_hist, d) for a, b, c, _, d in self.frame_info]
+      self.frames = list(hevc_decode(self.tensor, self.opaque.contiguous().realize(), frame_info, self.luma_h, self.luma_w))
+    return self.frames[start:self.frame_pos]
+
+  def to_rgb(self, frame:Tensor) -> Tensor:
+    """Run `frame` through `to_bgr`, reverse axis 2 of the result, and realize it."""
+    return to_bgr(frame, self.h, self.w, self.luma_w, self.chroma_off).flip(2).realize()
+
+
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.add_argument("--input_file", type=str, default="")

diff --git a/extra/hevc/hevc.py b/extra/hevc/hevc.py
@@ -266,7 +266,7 @@ def fill_pps_into_dev_context(device_ctx, pps:PPS):
 
 def parse_hevc_file_headers(dat:bytes, device="NV"):
   res = []
-  nal_unit_start = 1
+  nal_unit_start = dat.index(b"\x00\x00\x01")
   history:list[tuple[int, int, int]] = []
   device_ctx = nv_gpu.nvdec_hevc_pic_s(gptimer_timeout_value=92720000, tileformat=1, sw_start_code_e=1, pattern_id=2)
   nal_infos = []

diff --git a/test/testextra/test_hevc.py b/test/testextra/test_hevc.py
@@ -3,7 +3,7 @@
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.helpers import fetch, round_up
 from extra.hevc.hevc import parse_hevc_file_headers, nv_gpu
-from extra.hevc.decode import hevc_decode
+from extra.hevc.decode import HevcPacketDecoder, hevc_decode
 
 class TestHevc(unittest.TestCase):
   def test_hevc_parser(self):
@@ -17,6 +17,7 @@ def _test_common(frame, bts):
       self.assertEqual(frame0.pic_height_in_luma_samples, 1216)
       self.assertEqual(frame0.chroma_format_idc, 1)
       self.assertEqual(frame0.bit_depth_luma, 8)
+      self.assertEqual(parse_hevc_file_headers(dat[1:], device=Device.DEFAULT)[2:4], (w, h))
       self.assertEqual(frame0.bit_depth_chroma, 8)
       self.assertEqual(frame0.log2_min_luma_coding_block_size, 3)
       self.assertEqual(frame0.log2_max_luma_coding_block_size, 5)
@@ -83,5 +84,16 @@ def test_hevc_decode(self):
       self.assertEqual(f.dtype, dtypes.uint8)
       self.assertEqual(f.device, "NV")
 
+    header = dat[:frame_info[0][0]]
+    packets = [dat[offset:offset+sz] for offset, sz, *_ in frame_info]
+    decoder = HevcPacketDecoder(header, packets)
+    for expected, packet in zip(frames, packets):
+      decoded = decoder.Decode(packet)
+      self.assertEqual(len(decoded), 1)
+      Device.default.synchronize()
+      self.assertEqual(bytes(decoded[0].data()), bytes(expected.data()))
+    rgb = decoder.to_rgb(frames[0])
+    self.assertEqual(rgb.shape, (h, w, 3))
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
@@ -437,7 +437,8 @@ def setup_usermode(self):
     self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses)
     self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses)
     self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses)
-    self.viddec_class:int|None = next((c for c in [nv_gpu.NVCFB0_VIDEO_DECODER, nv_gpu.NVC9B0_VIDEO_DECODER] if c in self.nvclasses), None)
+    self.viddec_class:int|None = next((c for c in [nv_gpu.NVCFB0_VIDEO_DECODER, nv_gpu.NVCDB0_VIDEO_DECODER, nv_gpu.NVC9B0_VIDEO_DECODER,
+      nv_gpu.NVC7B0_VIDEO_DECODER, nv_gpu.NVC6B0_VIDEO_DECODER, nv_gpu.NVC4B0_VIDEO_DECODER, nv_gpu.NVB8B0_VIDEO_DECODER] if c in self.nvclasses), None)
 
     usermode = self.rm_alloc(self.dev.subdevice, self.usermode_class)
     return usermode, MMIOInterface(self._gpu_map_to_cpu(usermode, mmio_sz:=0x10000), mmio_sz, fmt='I')