Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions bench/ctable/ctable_v_panda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import time
import numpy as np
import blosc2
import pandas as pd
from pydantic import BaseModel, Field
from typing import Annotated
import psutil

# --- 1. The complex RowModel (fields annotated with NumPy dtypes) ---
class NumpyDtype:
    """Annotation marker pairing a Pydantic field with a NumPy column dtype."""

    def __init__(self, dtype):
        # Kept as a plain attribute so CTable can read it off Annotated metadata.
        self.dtype = dtype

class RowModel(BaseModel):
    """Row schema: each field carries the NumPy dtype its CTable column should use."""

    # Non-negative integer identifier, stored as int64.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex measurement, stored as complex128; defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score constrained to [0, 100], stored as float64.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Boolean flag, stored as NumPy bool_; defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True

# --- 2. Parameters ---
# BUG FIX: N is 10M but the banner/labels hard-coded "1M"; derive them from N.
N = 10_000_000  # number of benchmark rows
print(f"=== BENCHMARK: {N:,} Filas COMPLEJAS (Listas de Listas) ===\n")

# ==========================================
# 0. GENERATE DATA (complex list of lists)
# ==========================================
print(f"--- Generando {N:,} filas complejas ---")
t0 = time.time()
# One [id, complex, float, bool] row per index (comprehension instead of
# a for/append loop).
data_list = [
    [
        i,                              # id: int64
        complex(i * 0.1, i * 0.01),     # c_val: complex128
        10.0 + np.sin(i * 0.001) * 50,  # score: float64
        (i % 3 == 0),                   # active: bool
    ]
    for i in range(N)
]
t_gen = time.time() - t0
print(f"Tiempo generación: {t_gen:.4f} s")
print(f"Lista ocupa: {len(data_list):,} filas\n")

# ==========================================
# 1. PANDAS: complex list -> DataFrame
# ==========================================
print("--- 1. PANDAS (Creación) ---")
# RSS before/after gives an approximate memory cost of building the frame.
gc_pandas = psutil.Process().memory_info().rss / (1024**2)
t0 = time.time()

df = pd.DataFrame(data_list, columns=['id', 'c_val', 'score', 'active'])

t_pandas_create = time.time() - t0
gc_pandas_after = psutil.Process().memory_info().rss / (1024**2)
mem_pandas = gc_pandas_after - gc_pandas  # approximate DataFrame footprint (MB)
print(f"Tiempo creación: {t_pandas_create:.4f} s")
print(f"Memoria usada: {mem_pandas:.2f} MB")

# Pandas head(1000)
t0 = time.time()
# BUG FIX: the label advertises head(1000) but the code measured head(N)
# (i.e. slicing the entire frame); take the first 1000 rows as advertised.
df_head = df.head(1000)
t_pandas_head = time.time() - t0
print(f"Tiempo head(1000): {t_pandas_head:.6f} s\n")

# ==========================================
# 2. BLOSC2 Oficial: extend() con conversión
# ==========================================
print("--- 2. BLOSC2 Oficial (extend + conversión Pydantic) ---")
gc_blosc = psutil.Process().memory_info().rss / (1024**2)  # RSS (MB) before creation
t0 = time.time()

# Blosc2 ingests the raw rows through the Pydantic RowModel.
ctable = blosc2.CTable(RowModel, expected_size=N)
ctable.extend(data_list)

t_blosc_create = time.time() - t0
gc_blosc_after = psutil.Process().memory_info().rss / (1024**2)
mem_blosc = gc_blosc_after - gc_blosc  # approximate in-process footprint (MB)
print(f"Tiempo creación: {t_blosc_create:.4f} s")
# BUG FIX: mem_blosc was computed but never reported (the pandas section
# prints its counterpart); also dropped the unused mem_compressed local.
print(f"Memoria usada: {mem_blosc:.2f} MB")
# Account for every column chunk plus the validity bitmap.
total_comprimido = sum(col.cbytes for col in ctable._cols.values()) + ctable._valid_rows.cbytes
total_sin_comprimir = sum(col.nbytes for col in ctable._cols.values()) + ctable._valid_rows.nbytes

print(f"Comprimido: {total_comprimido / 1024 ** 2:.2f} MB")
print(f"Sin comprimir: {total_sin_comprimir / 1024 ** 2:.2f} MB")
# BUG FIX: ':.2' formats to two *significant digits* (can render '1.2e+01');
# ':.2f' gives the intended fixed-point ratio.
print(f"Ratio: {total_sin_comprimir/total_comprimido:.2f}x")

t0 = time.time()
# BUG FIX: label says head(1000) but head(N) fetched every row.
ctable_head = ctable.head(1000)
t_blosc_head = time.time() - t0
print(f"Tiempo head(1000): {t_blosc_head:.6f} s\n")



# ==========================================
# 🏆 FULL SUMMARY
# ==========================================
# Build the header once and emit it with a single write; the rendered
# text is identical to printing each line separately.
_summary_lines = [
    "═" * 80,
    "🥇 BENCHMARK 1M FILAS COMPLEJAS (int64+complex128+float64+bool)",
    "═" * 80,
    f"{'MÉTRICA':<22} {'PANDAS':>12} {'BLOsc2*':>10} {'TU CTable':>12}",
    f"{'':<22} {'':>12} {'*+Pydantic':>10} {'¡Directo!':>12}",
    "-" * 80,
]
print("\n".join(_summary_lines))
75 changes: 75 additions & 0 deletions bench/ctable/expected_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring the overhead of resize() when expected_size
# is too small (M rows) vs correctly sized (N rows) during extend().

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Marker object used inside Annotated[...] to declare a column's NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype  # exposed for CTable's schema introspection


# Row model
class RowModel(BaseModel):
    """Benchmark row schema; Annotated metadata maps each field to a NumPy dtype."""

    # Non-negative integer identifier -> int64 column.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex payload -> complex128 column, default 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Bounded score in [0, 100] -> float64 column.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Flag -> bool_ column, default True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True



# Benchmark parameters: M is a deliberately-too-small expected_size hint,
# N is the starting row count (doubled each round, capped at MAX_N).
M = 50
N = 62_500
MAX_N = 1_000_000
print(f"expected_size benchmark | wrong expected_size = {M}")

# Pre-generate the full dataset once so the timed sections cover extend() only.
np_dtype = np.dtype(
    [
        ("id", np.int64),
        ("c_val", np.complex128),
        ("score", np.float64),
        ("active", np.bool_),
    ]
)


def _make_row(idx):
    # One structured record: (id, complex value, bounded score, active flag).
    return (idx, complex(idx * 0.1, idx * 0.01), 10.0 + (idx % 100) * 0.4, idx % 3 == 0)


DATA = np.array([_make_row(idx) for idx in range(MAX_N)], dtype=np_dtype)

while N <= MAX_N:
    print("-" * 80)
    print(f"N = {N:,} rows")

    rows = DATA[:N]

    # 1. extend() with the correct expected_size hint (no internal resize).
    ct_correct = blosc2.CTable(RowModel, expected_size=N)
    start = time()
    ct_correct.extend(rows)
    t_correct = time() - start
    print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(ct_correct):,}")

    # 2. extend() with a far-too-small hint, forcing resize() on the way.
    ct_wrong = blosc2.CTable(RowModel, expected_size=M)
    start = time()
    ct_wrong.extend(rows)
    t_wrong = time() - start
    print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(ct_wrong):,}")

    # Summary for this round.
    print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")

    N *= 2
117 changes: 117 additions & 0 deletions bench/ctable/extend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring CTable creation time from three different sources:
# 1. Python list of lists (1M rows)
# 2. NumPy structured array (1M rows) — one structured record per row
# 3. An existing CTable (previously created from Python lists, 1M rows)

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Holds the NumPy dtype a Pydantic field should map to in a CTable column."""

    def __init__(self, dtype):
        # Plain attribute; read back via Annotated metadata inspection.
        self.dtype = dtype





# ---------------------------------------------------------------------------
# Row model
# ---------------------------------------------------------------------------
class RowModel(BaseModel):
    """Row schema for the creation benchmark; Annotated metadata names column dtypes."""

    # Non-negative id -> int64.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex value -> complex128, default 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score in [0, 100] -> float64.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Flag -> bool_, default True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000
print(f"CTable creation benchmark with {N:,} rows\n")

# ---------------------------------------------------------------------------
# Base data generation (not part of the benchmark timing)
# ---------------------------------------------------------------------------
print("Generating base data...")

start = time()
# Python list-of-lists source, built with an explicit loop.
data_list = []
for row_id in range(N):
    data_list.append(
        [row_id, complex(row_id * 0.1, row_id * 0.01), 10.0 + (row_id % 100) * 0.4, row_id % 3 == 0]
    )
t_gen_list = time() - start
print(f" Python list generated in: {t_gen_list:.4f} s")

start = time()
# NumPy structured-array source with the same four fields and values.
np_dtype = np.dtype(
    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
)
records = [
    (row_id, complex(row_id * 0.1, row_id * 0.01), 10.0 + (row_id % 100) * 0.4, row_id % 3 == 0)
    for row_id in range(N)
]
data_np = np.array(records, dtype=np_dtype)
t_gen_np = time() - start
print(f" NumPy structured array generated: {t_gen_np:.4f} s\n")

# ---------------------------------------------------------------------------
# 1. Creation from a Python list of lists
# ---------------------------------------------------------------------------
print("CTable from Python list of lists")
t0 = time()
ct_from_list = blosc2.CTable(RowModel, expected_size=N)  # correct size hint avoids resizes
ct_from_list.extend(data_list)
t_from_list = time() - t0
print(f" extend() time (Python list): {t_from_list:.4f} s")
print(f" Rows: {len(ct_from_list):,}")

# ---------------------------------------------------------------------------
# 2. Creation from a NumPy structured array (one record per row)
# ---------------------------------------------------------------------------
print("CTable from NumPy structured array")
t0 = time()
ct_from_np = blosc2.CTable(RowModel, expected_size=N)
ct_from_np.extend(data_np)
t_from_np = time() - t0
print(f" extend() time (NumPy struct): {t_from_np:.4f} s")
print(f" Rows: {len(ct_from_np):,}")


# ---------------------------------------------------------------------------
# 3. Creation from an existing CTable (ct_from_list, already built above)
# ---------------------------------------------------------------------------
print("CTable from an existing CTable")
t0 = time()
ct_from_ctable = blosc2.CTable(RowModel, expected_size=N)
# NOTE(review): presumably extend() iterates the source CTable row by row —
# confirm against CTable.extend's accepted input types.
ct_from_ctable.extend(ct_from_list)
t_from_ctable = time() - t0
print(f" extend() time (CTable): {t_from_ctable:.4f} s")
print(f" Rows: {len(ct_from_ctable):,}")

# ---------------------------------------------------------------------------
# Summary: creation time per source, speedup relative to the Python-list run.
# ---------------------------------------------------------------------------
print("\n")
print("=" * 60)
print(f"{'SOURCE':<30} {'TIME (s)':>12} {'SPEEDUP vs list':>18}")
print("-" * 60)
_summary_rows = [
    ("Python list of lists", t_from_list, None),   # baseline row
    ("NumPy structured array", t_from_np, t_from_list / t_from_np),
    ("Existing CTable", t_from_ctable, t_from_list / t_from_ctable),
]
for _label, _elapsed, _speedup in _summary_rows:
    if _speedup is None:
        print(f"{_label:<30} {_elapsed:>12.4f} {'1.00x':>18}")
    else:
        print(f"{_label:<30} {_elapsed:>12.4f} {_speedup:>17.2f}x")
84 changes: 84 additions & 0 deletions bench/ctable/extend_vs_apend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for comparing append() (row by row) vs extend() (bulk),
# to find the crossover point where extend() becomes worth it.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation payload declaring the NumPy dtype of a CTable column."""

    def __init__(self, dtype):
        self.dtype = dtype  # stored verbatim for later schema lookup


# Row model
class RowModel(BaseModel):
    """Row schema for the append-vs-extend benchmark; Annotated metadata names dtypes."""

    # Non-negative id -> int64.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex value -> complex128, default 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score in [0, 100] -> float64.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Flag -> bool_, default True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


# Parameter — N starts tiny and doubles each round to find the crossover
# point where bulk extend() becomes worth it over per-row append().
N = 2
print("append() vs extend() benchmark")  # no placeholders, so no f-prefix needed
# STRUCTURE FIX: the diff view dropped indentation; the whole benchmark body
# belongs inside this doubling loop (the trailing N = N * 2 confirms it).
# The loop variable is kept distinct from the comprehension's `i` to avoid
# shadowing.
for _round in range(6):
    print("\n")
    print("%" * 100)

    # Base data generation (outside the timed sections).
    data_list = [
        [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] for i in range(N)
    ]

    # 1. N individual append() calls
    print(f"{N} individual append() calls")
    ct_append = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    for row in data_list:
        ct_append.append(row)
    t_append = time() - t0
    print(f" Time: {t_append:.6f} s")
    print(f" Rows: {len(ct_append):,}")

    # 2. N individual extend() calls (one row at a time)
    print(f"{N} individual extend() calls (one row at a time)")
    ct_extend_one = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    for row in data_list:
        ct_extend_one.extend([row])
    t_extend_one = time() - t0
    print(f" Time: {t_extend_one:.6f} s")
    print(f" Rows: {len(ct_extend_one):,}")

    # 3. Single extend() call with all N rows at once
    print(f"Single extend() call with all {N} rows at once")
    ct_extend_bulk = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    ct_extend_bulk.extend(data_list)
    t_extend_bulk = time() - t0
    print(f" Time: {t_extend_bulk:.6f} s")
    print(f" Rows: {len(ct_extend_bulk):,}")

    # Per-round summary, normalized to the append() baseline.
    print("=" * 70)
    print(f"{'METHOD':<35} {'TIME (s)':>12} {'SPEEDUP vs append':>20}")
    print("-" * 70)
    print(f"{'append() x N':<35} {t_append:>12.6f} {'1.00x':>20}")
    print(f"{'extend() x N (one row each)':<35} {t_extend_one:>12.6f} {t_append / t_extend_one:>19.2f}x")
    print(f"{'extend() x 1 (all at once)':<35} {t_extend_bulk:>12.6f} {t_append / t_extend_bulk:>19.2f}x")
    print("-" * 70)

    N = N * 2
Loading