Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions bench/ctable/ctable_v_panda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import time
import numpy as np
import blosc2
import pandas as pd
from pydantic import BaseModel, Field
from typing import Annotated
import psutil

# --- 1. The complex RowModel (fields annotated with NumPy dtypes) ---
class NumpyDtype:
    """Annotation marker pairing a Pydantic field with a NumPy column dtype."""

    def __init__(self, dtype):
        # Kept as a plain attribute so CTable can read it off Annotated metadata.
        self.dtype = dtype

class RowModel(BaseModel):
    """Row schema: each field carries the NumPy dtype its CTable column should use."""

    # Non-negative integer identifier, stored as int64.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex measurement, stored as complex128; defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score constrained to [0, 100], stored as float64.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Boolean flag, stored as NumPy bool_; defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True

# --- 2. Parameters ---
# BUG FIX: N is 10M but the banner/labels hard-coded "1M"; derive them from N.
N = 10_000_000  # number of benchmark rows
print(f"=== BENCHMARK: {N:,} Filas COMPLEJAS (Listas de Listas) ===\n")

# ==========================================
# 0. GENERATE DATA (complex list of lists)
# ==========================================
print(f"--- Generando {N:,} filas complejas ---")
t0 = time.time()
# One [id, complex, float, bool] row per index (comprehension instead of
# a for/append loop).
data_list = [
    [
        i,                              # id: int64
        complex(i * 0.1, i * 0.01),     # c_val: complex128
        10.0 + np.sin(i * 0.001) * 50,  # score: float64
        (i % 3 == 0),                   # active: bool
    ]
    for i in range(N)
]
t_gen = time.time() - t0
print(f"Tiempo generación: {t_gen:.4f} s")
print(f"Lista ocupa: {len(data_list):,} filas\n")

# ==========================================
# 1. PANDAS: complex list -> DataFrame
# ==========================================
print("--- 1. PANDAS (Creación) ---")
# RSS before/after gives an approximate memory cost of building the frame.
gc_pandas = psutil.Process().memory_info().rss / (1024**2)
t0 = time.time()

df = pd.DataFrame(data_list, columns=['id', 'c_val', 'score', 'active'])

t_pandas_create = time.time() - t0
gc_pandas_after = psutil.Process().memory_info().rss / (1024**2)
mem_pandas = gc_pandas_after - gc_pandas  # approximate DataFrame footprint (MB)
print(f"Tiempo creación: {t_pandas_create:.4f} s")
print(f"Memoria usada: {mem_pandas:.2f} MB")

# Pandas head(1000)
t0 = time.time()
# BUG FIX: the label advertises head(1000) but the code measured head(N)
# (i.e. slicing the entire frame); take the first 1000 rows as advertised.
df_head = df.head(1000)
t_pandas_head = time.time() - t0
print(f"Tiempo head(1000): {t_pandas_head:.6f} s\n")

# ==========================================
# 2. BLOSC2 Oficial: extend() con conversión
# ==========================================
print("--- 2. BLOSC2 Oficial (extend + conversión Pydantic) ---")
gc_blosc = psutil.Process().memory_info().rss / (1024**2)  # RSS (MB) before creation
t0 = time.time()

# Blosc2 ingests the raw rows through the Pydantic RowModel.
ctable = blosc2.CTable(RowModel, expected_size=N)
ctable.extend(data_list)

t_blosc_create = time.time() - t0
gc_blosc_after = psutil.Process().memory_info().rss / (1024**2)
mem_blosc = gc_blosc_after - gc_blosc  # approximate in-process footprint (MB)
print(f"Tiempo creación: {t_blosc_create:.4f} s")
# BUG FIX: mem_blosc was computed but never reported (the pandas section
# prints its counterpart); also dropped the unused mem_compressed local.
print(f"Memoria usada: {mem_blosc:.2f} MB")
# Account for every column chunk plus the validity bitmap.
total_comprimido = sum(col.cbytes for col in ctable._cols.values()) + ctable._valid_rows.cbytes
total_sin_comprimir = sum(col.nbytes for col in ctable._cols.values()) + ctable._valid_rows.nbytes

print(f"Comprimido: {total_comprimido / 1024 ** 2:.2f} MB")
print(f"Sin comprimir: {total_sin_comprimir / 1024 ** 2:.2f} MB")
# BUG FIX: ':.2' formats to two *significant digits* (can render '1.2e+01');
# ':.2f' gives the intended fixed-point ratio.
print(f"Ratio: {total_sin_comprimir/total_comprimido:.2f}x")

t0 = time.time()
# BUG FIX: label says head(1000) but head(N) fetched every row.
ctable_head = ctable.head(1000)
t_blosc_head = time.time() - t0
print(f"Tiempo head(1000): {t_blosc_head:.6f} s\n")



# ==========================================
# 🏆 FULL SUMMARY
# ==========================================
# Build the header once and emit it with a single write; the rendered
# text is identical to printing each line separately.
_summary_lines = [
    "═" * 80,
    "🥇 BENCHMARK 1M FILAS COMPLEJAS (int64+complex128+float64+bool)",
    "═" * 80,
    f"{'MÉTRICA':<22} {'PANDAS':>12} {'BLOsc2*':>10} {'TU CTable':>12}",
    f"{'':<22} {'':>12} {'*+Pydantic':>10} {'¡Directo!':>12}",
    "-" * 80,
]
print("\n".join(_summary_lines))
75 changes: 75 additions & 0 deletions bench/ctable/expected_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring the overhead of resize() when expected_size
# is too small (M rows) vs correctly sized (N rows) during extend().

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Marker object used inside Annotated[...] to declare a column's NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype  # exposed for CTable's schema introspection


# Row model
class RowModel(BaseModel):
    """Benchmark row schema; Annotated metadata maps each field to a NumPy dtype."""

    # Non-negative integer identifier -> int64 column.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex payload -> complex128 column, default 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Bounded score in [0, 100] -> float64 column.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Flag -> bool_ column, default True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True



# Benchmark parameters: M is a deliberately-too-small expected_size hint,
# N is the starting row count (doubled each round, capped at MAX_N).
M = 50
N = 62_500
MAX_N = 1_000_000
print(f"expected_size benchmark | wrong expected_size = {M}")

# Pre-generate the full dataset once so the timed sections cover extend() only.
np_dtype = np.dtype(
    [
        ("id", np.int64),
        ("c_val", np.complex128),
        ("score", np.float64),
        ("active", np.bool_),
    ]
)


def _make_row(idx):
    # One structured record: (id, complex value, bounded score, active flag).
    return (idx, complex(idx * 0.1, idx * 0.01), 10.0 + (idx % 100) * 0.4, idx % 3 == 0)


DATA = np.array([_make_row(idx) for idx in range(MAX_N)], dtype=np_dtype)

while N <= MAX_N:
    print("-" * 80)
    print(f"N = {N:,} rows")

    rows = DATA[:N]

    # 1. extend() with the correct expected_size hint (no internal resize).
    ct_correct = blosc2.CTable(RowModel, expected_size=N)
    start = time()
    ct_correct.extend(rows)
    t_correct = time() - start
    print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(ct_correct):,}")

    # 2. extend() with a far-too-small hint, forcing resize() on the way.
    ct_wrong = blosc2.CTable(RowModel, expected_size=M)
    start = time()
    ct_wrong.extend(rows)
    t_wrong = time() - start
    print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(ct_wrong):,}")

    # Summary for this round.
    print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")

    N *= 2
117 changes: 117 additions & 0 deletions bench/ctable/extend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring CTable creation time from three different sources:
# 1. Python list of lists (1M rows)
# 2. NumPy structured array (1M rows) — one structured record per row
# 3. An existing CTable (previously created from Python lists, 1M rows)

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Holds the NumPy dtype a Pydantic field should map to in a CTable column."""

    def __init__(self, dtype):
        # Plain attribute; read back via Annotated metadata inspection.
        self.dtype = dtype





# ---------------------------------------------------------------------------
# Row model
# ---------------------------------------------------------------------------
class RowModel(BaseModel):
    """Row schema for the creation benchmark; Annotated metadata names column dtypes."""

    # Non-negative id -> int64.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex value -> complex128, default 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score in [0, 100] -> float64.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Flag -> bool_, default True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000
print(f"CTable creation benchmark with {N:,} rows\n")

# ---------------------------------------------------------------------------
# Base data generation (not part of the benchmark timing)
# ---------------------------------------------------------------------------
print("Generating base data...")

start = time()
# Python list-of-lists source, built with an explicit loop.
data_list = []
for row_id in range(N):
    data_list.append(
        [row_id, complex(row_id * 0.1, row_id * 0.01), 10.0 + (row_id % 100) * 0.4, row_id % 3 == 0]
    )
t_gen_list = time() - start
print(f" Python list generated in: {t_gen_list:.4f} s")

start = time()
# NumPy structured-array source with the same four fields and values.
np_dtype = np.dtype(
    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
)
records = [
    (row_id, complex(row_id * 0.1, row_id * 0.01), 10.0 + (row_id % 100) * 0.4, row_id % 3 == 0)
    for row_id in range(N)
]
data_np = np.array(records, dtype=np_dtype)
t_gen_np = time() - start
print(f" NumPy structured array generated: {t_gen_np:.4f} s\n")

# ---------------------------------------------------------------------------
# 1. Creation from a Python list of lists
# ---------------------------------------------------------------------------
print("CTable from Python list of lists")
t0 = time()
ct_from_list = blosc2.CTable(RowModel, expected_size=N)  # correct size hint avoids resizes
ct_from_list.extend(data_list)
t_from_list = time() - t0
print(f" extend() time (Python list): {t_from_list:.4f} s")
print(f" Rows: {len(ct_from_list):,}")

# ---------------------------------------------------------------------------
# 2. Creation from a NumPy structured array (one record per row)
# ---------------------------------------------------------------------------
print("CTable from NumPy structured array")
t0 = time()
ct_from_np = blosc2.CTable(RowModel, expected_size=N)
ct_from_np.extend(data_np)
t_from_np = time() - t0
print(f" extend() time (NumPy struct): {t_from_np:.4f} s")
print(f" Rows: {len(ct_from_np):,}")


# ---------------------------------------------------------------------------
# 3. Creation from an existing CTable (ct_from_list, already built above)
# ---------------------------------------------------------------------------
print("CTable from an existing CTable")
t0 = time()
ct_from_ctable = blosc2.CTable(RowModel, expected_size=N)
# NOTE(review): presumably extend() iterates the source CTable row by row —
# confirm against CTable.extend's accepted input types.
ct_from_ctable.extend(ct_from_list)
t_from_ctable = time() - t0
print(f" extend() time (CTable): {t_from_ctable:.4f} s")
print(f" Rows: {len(ct_from_ctable):,}")

# ---------------------------------------------------------------------------
# Summary: creation time per source, speedup relative to the Python-list run.
# ---------------------------------------------------------------------------
print("\n")
print("=" * 60)
print(f"{'SOURCE':<30} {'TIME (s)':>12} {'SPEEDUP vs list':>18}")
print("-" * 60)
_summary_rows = [
    ("Python list of lists", t_from_list, None),   # baseline row
    ("NumPy structured array", t_from_np, t_from_list / t_from_np),
    ("Existing CTable", t_from_ctable, t_from_list / t_from_ctable),
]
for _label, _elapsed, _speedup in _summary_rows:
    if _speedup is None:
        print(f"{_label:<30} {_elapsed:>12.4f} {'1.00x':>18}")
    else:
        print(f"{_label:<30} {_elapsed:>12.4f} {_speedup:>17.2f}x")
84 changes: 84 additions & 0 deletions bench/ctable/extend_vs_apend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for comparing append() (row by row) vs extend() (bulk),
# to find the crossover point where extend() becomes worth it.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation payload declaring the NumPy dtype of a CTable column."""

    def __init__(self, dtype):
        self.dtype = dtype  # stored verbatim for later schema lookup


# Row model
class RowModel(BaseModel):
    """Row schema for the append-vs-extend benchmark; Annotated metadata names dtypes."""

    # Non-negative id -> int64.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Complex value -> complex128, default 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # Score in [0, 100] -> float64.
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Flag -> bool_, default True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


# Parameter — N starts tiny and doubles each round to find the crossover
# point where bulk extend() becomes worth it over per-row append().
N = 2
print("append() vs extend() benchmark")  # no placeholders, so no f-prefix needed
# STRUCTURE FIX: the diff view dropped indentation; the whole benchmark body
# belongs inside this doubling loop (the trailing N = N * 2 confirms it).
# The loop variable is kept distinct from the comprehension's `i` to avoid
# shadowing.
for _round in range(6):
    print("\n")
    print("%" * 100)

    # Base data generation (outside the timed sections).
    data_list = [
        [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] for i in range(N)
    ]

    # 1. N individual append() calls
    print(f"{N} individual append() calls")
    ct_append = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    for row in data_list:
        ct_append.append(row)
    t_append = time() - t0
    print(f" Time: {t_append:.6f} s")
    print(f" Rows: {len(ct_append):,}")

    # 2. N individual extend() calls (one row at a time)
    print(f"{N} individual extend() calls (one row at a time)")
    ct_extend_one = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    for row in data_list:
        ct_extend_one.extend([row])
    t_extend_one = time() - t0
    print(f" Time: {t_extend_one:.6f} s")
    print(f" Rows: {len(ct_extend_one):,}")

    # 3. Single extend() call with all N rows at once
    print(f"Single extend() call with all {N} rows at once")
    ct_extend_bulk = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    ct_extend_bulk.extend(data_list)
    t_extend_bulk = time() - t0
    print(f" Time: {t_extend_bulk:.6f} s")
    print(f" Rows: {len(ct_extend_bulk):,}")

    # Per-round summary, normalized to the append() baseline.
    print("=" * 70)
    print(f"{'METHOD':<35} {'TIME (s)':>12} {'SPEEDUP vs append':>20}")
    print("-" * 70)
    print(f"{'append() x N':<35} {t_append:>12.6f} {'1.00x':>20}")
    print(f"{'extend() x N (one row each)':<35} {t_extend_one:>12.6f} {t_append / t_extend_one:>19.2f}x")
    print(f"{'extend() x 1 (all at once)':<35} {t_extend_bulk:>12.6f} {t_append / t_extend_bulk:>19.2f}x")
    print("-" * 70)

    N = N * 2
Loading