diff --git a/bench/ctable/ctable_v_panda.py b/bench/ctable/ctable_v_panda.py
new file mode 100644
index 00000000..8d22169a
--- /dev/null
+++ b/bench/ctable/ctable_v_panda.py
@@ -0,0 +1,100 @@
+import time
+import numpy as np
+import blosc2
+import pandas as pd
+from pydantic import BaseModel, Field
+from typing import Annotated
+import psutil
+
+# --- 1. A complex RowModel ---
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+# --- 2. Parameters ---
+N = 10_000_000  # 10M rows
+print(f"=== BENCHMARK: {N:,} COMPLEX rows (lists of lists) ===\n")
+
+# ==========================================
+# 0. GENERATE DATA (complex list of lists)
+# ==========================================
+print(f"--- Generating {N:,} complex rows ---")
+t0 = time.time()
+data_list = []
+for i in range(N):
+    data_list.append([
+        i,                              # id: int64
+        complex(i * 0.1, i * 0.01),     # c_val: complex128
+        10.0 + np.sin(i * 0.001) * 50,  # score: float64
+        (i % 3 == 0)                    # active: bool
+    ])
+t_gen = time.time() - t0
+print(f"Generation time: {t_gen:.4f} s")
+print(f"List holds: {len(data_list):,} rows\n")
+
+# ==========================================
+# 1. PANDAS: complex list -> DataFrame
+# ==========================================
+print("--- 1. PANDAS (creation) ---")
+gc_pandas = psutil.Process().memory_info().rss / (1024**2)
+t0 = time.time()
+
+df = pd.DataFrame(data_list, columns=['id', 'c_val', 'score', 'active'])
+
+t_pandas_create = time.time() - t0
+gc_pandas_after = psutil.Process().memory_info().rss / (1024**2)
+mem_pandas = gc_pandas_after - gc_pandas
+print(f"Creation time: {t_pandas_create:.4f} s")
+print(f"Memory used: {mem_pandas:.2f} MB")
+
+# Pandas head(1000)
+t0 = time.time()
+df_head = df.head(1000)
+t_pandas_head = time.time() - t0
+print(f"head(1000) time: {t_pandas_head:.6f} s\n")
+
+# ==========================================
+# 2. Official BLOSC2: extend() with conversion
+# ==========================================
+print("--- 2. BLOSC2 (extend + Pydantic conversion) ---")
+gc_blosc = psutil.Process().memory_info().rss / (1024**2)
+t0 = time.time()
+
+# This path goes through the Pydantic-declared row model
+ctable = blosc2.CTable(RowModel, expected_size=N)
+ctable.extend(data_list)
+
+t_blosc_create = time.time() - t0
+gc_blosc_after = psutil.Process().memory_info().rss / (1024**2)
+mem_blosc = gc_blosc_after - gc_blosc
+print(f"Creation time: {t_blosc_create:.4f} s")
+print(f"RSS delta: {mem_blosc:.2f} MB")
+total_comprimido = sum(col.cbytes for col in ctable._cols.values()) + ctable._valid_rows.cbytes
+total_sin_comprimir = sum(col.nbytes for col in ctable._cols.values()) + ctable._valid_rows.nbytes
+
+print(f"Compressed: {total_comprimido / 1024 ** 2:.2f} MB")
+print(f"Uncompressed: {total_sin_comprimir / 1024 ** 2:.2f} MB")
+print(f"Ratio: {total_sin_comprimir / total_comprimido:.2f}x")
+
+t0 = time.time()
+ctable_head = ctable.head(1000)
+t_blosc_head = time.time() - t0
+print(f"head(1000) time: {t_blosc_head:.6f} s\n")
+
+
+# ==========================================
+# SUMMARY
+# ==========================================
+print("═" * 80)
+print(f"BENCHMARK {N:,} COMPLEX ROWS (int64+complex128+float64+bool)")
+print("═" * 80)
+print(f"{'METRIC':<22} {'PANDAS':>14} {'BLOSC2':>14}")
+print("-" * 80)
+print(f"{'creation (s)':<22} {t_pandas_create:>14.4f} {t_blosc_create:>14.4f}")
+print(f"{'head(1000) (s)':<22} {t_pandas_head:>14.6f} {t_blosc_head:>14.6f}")
+print(f"{'memory (MB)':<22} {mem_pandas:>14.2f} {total_comprimido / 1024 ** 2:>14.2f}")
+print("-" * 80)
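The manual cbytes/nbytes sums above are also exposed through the `CTable.info()` method added later in this patch. A tiny, illustrative sketch (reusing the `RowModel` defined in the script above):

    import blosc2

    t = blosc2.CTable(RowModel, new_data=[[1, 1j, 50.0, True]])
    t.info()  # prints column dtypes plus compressed/uncompressed sizes and ratio
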
diff --git a/bench/ctable/print.py b/bench/ctable/print.py
new file mode 100644
index 00000000..f8820036
--- /dev/null
+++ b/bench/ctable/print.py
@@ -0,0 +1,81 @@
+import time
+import numpy as np
+import pandas as pd
+import blosc2
+from pydantic import BaseModel, Field
+
+# --- 1. Define the model ---
+class RowModel(BaseModel):
+    id: int = Field(ge=0)
+    name: bytes = Field(default=b"unknown", max_length=10)
+    score: float
+
+# --- 2. Parameters ---
+N = 100_000
+row_data = {"id": 1, "name": b"benchmark", "score": 3.14}
+
+print(f"=== BENCHMARK: iterative ingestion ({N} rows) ===\n")
+
+# ==========================================
+# PANDAS TEST (baseline)
+# ==========================================
+print("--- 1. PANDAS (list -> DataFrame) ---")
+t0 = time.time()
+
+buffer_list = []
+for _ in range(N):
+    buffer_list.append(row_data)
+
+df = pd.DataFrame(buffer_list)
+t_pandas = time.time() - t0
+
+print(f"Total time: {t_pandas:.4f} s")
+mem_pandas = df.memory_usage(deep=True).sum() / (1024**2)
+print(f"RAM usage: {mem_pandas:.2f} MB")
+
+print("\n--- PANDAS: first 1000 rows ---")
+t0_print = time.time()
+print(df.head(1000).to_string())
+t_print_pandas = time.time() - t0_print
+print(f"\nPrint time: {t_print_pandas:.4f} s")
+
+
+# ==========================================
+# BLOSC2 TEST (strategy: extend() with a list)
+# ==========================================
+print("\n" + "="*60)
+print("--- 2. BLOSC2 (extend with a list of rows) ---")
+t0 = time.time()
+
+# Accumulate rows as tuples: CTable.extend() expects positional rows,
+# not dicts, so the dict values are unpacked here
+buffer_list_2 = []
+for _ in range(N):
+    buffer_list_2.append(tuple(row_data.values()))
+
+# Create an empty CTable and insert everything at once
+ctable = blosc2.CTable(RowModel)
+ctable.extend(buffer_list_2)
+
+t_blosc_extend = time.time() - t0
+print(f"Total time: {t_blosc_extend:.4f} s")
+
+mem_blosc_extend = sum(col.schunk.cbytes for col in ctable._cols.values()) / (1024**2)
+print(f"Memory (compressed): {mem_blosc_extend:.2f} MB")
+
+print("\n--- BLOSC2: first 1000 rows ---")
+t0_print = time.time()
+ctable_head = ctable.head(1000)
+print(ctable_head)
+t_print_blosc = time.time() - t0_print
+print(f"\nPrint time: {t_print_blosc:.4f} s")
+
+# ==========================================
+# CONCLUSIONS
+# ==========================================
+print("\n" + "="*60)
+print("--- SUMMARY ---")
+print(f"Pandas (list->df): {t_pandas:.4f} s")
+print(f"Blosc2 (extend):   {t_blosc_extend:.4f} s ({t_pandas/t_blosc_extend:.2f}x {'faster' if t_blosc_extend < t_pandas else 'slower'})")
+print(f"\nPandas print: {t_print_pandas:.4f} s")
+print(f"Blosc2 print: {t_print_blosc:.4f} s")
+print(f"\nBlosc2 vs Pandas compression: {mem_blosc_extend / mem_pandas * 100:.2f}% of the size")
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 24449554..7cf1560b 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -589,6 +589,7 @@ def _raise(exc):
     """
 
 # Delayed imports for avoiding overwriting of python builtins
+from .ctable import CTable
 from .ndarray import (
     abs,
     acos,
@@ -796,6 +797,7 @@ def _raise(exc):
     "count_nonzero",
     "cparams_dflts",
     "cpu_info",
+    "CTable",
     "cumulative_prod",
     "cumulative_sum",
     "decompress",
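Before the implementation, a minimal usage sketch of the API this patch introduces (names taken from the diff itself; the row model is illustrative):

    from pydantic import BaseModel, Field
    from blosc2 import CTable

    class Row(BaseModel):
        id: int = Field(ge=0)
        score: float = Field(ge=0, le=100)

    t = CTable(Row, new_data=[(1, 95.5), (2, 80.0), (3, 50.2)])
    assert len(t) == 3
    assert t.id[0] == 1   # columns are exposed as attributes
    t.extend([(4, 10.0)])
    t.delete(0)           # logical delete: the row is masked out, not moved
    assert len(t) == 3
    print(t.head(2))      # view over the first two valid rows
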
diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
new file mode 100644
index 00000000..018c050a
--- /dev/null
+++ b/src/blosc2/ctable.py
@@ -0,0 +1,788 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+
+"""CTable: a column-oriented table backed by Blosc2 NDArrays, with the row
+schema declared through a Pydantic model and logical deletion via a
+boolean validity mask."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any, Generic, TypeVar
+
+import numpy as np
+from pydantic import BaseModel
+
+import blosc2
+from blosc2 import compute_chunks_blocks
+
+try:
+    # Optional dependency, only used to profile CTable.where().
+    from line_profiler import profile
+except ImportError:  # pragma: no cover
+    def profile(func):
+        return func
+
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:
+    """Annotation metadata that pins a field to an explicit NumPy dtype."""
+
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class MaxLen:
+    """Annotation metadata that bounds the width of str/bytes columns."""
+
+    def __init__(self, length: int):
+        self.max_length = int(length)
+
+
+class _RowIndexer:
+    """Thin helper so that ``table.row[...]`` routes to row-wise selection."""
+
+    def __init__(self, table):
+        self._table = table
+
+    def __getitem__(self, item):
+        return self._table._run_row_logic(item)
+
+
+class Column:
+    """Logical view over one column: indices skip rows masked as deleted."""
+
+    def __init__(self, table: CTable, col_name: str):
+        self._table = table
+        self._col_name = col_name
+
+    @property
+    def _raw_col(self):
+        return self._table._cols[self._col_name]
+
+    @property
+    def _valid_rows(self):
+        return self._table._valid_rows
+
+    def __getitem__(self, key: int | slice | list | np.ndarray):
+        if isinstance(key, int):
+            n_rows = len(self)
+            if key < 0:
+                key += n_rows
+            if not (0 <= key < n_rows):
+                raise IndexError(f"index {key} is out of bounds for column with size {n_rows}")
+
+            # Walk the validity mask chunk by chunk, decompressing only the
+            # chunk that can contain the key-th True value.
+            arr = self._valid_rows
+            count = 0
+            chunk_size = arr.chunks[0]
+            pos_true = -1
+
+            for info in arr.iterchunks_info():
+                actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+                chunk_start = info.nchunk * chunk_size
+
+                # All-False chunk: skip without decompressing.
+                if info.special == blosc2.SpecialValue.ZERO:
+                    continue
+
+                # Repeated-value chunk: all True or all False.
+                if info.special == blosc2.SpecialValue.VALUE:
+                    val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                    if not val:
+                        continue
+                    if count + actual_size <= key:
+                        count += actual_size
+                        continue
+                    pos_true = chunk_start + (key - count)
+                    break
+
+                # General case: decompress only this chunk.
+                chunk_data = arr[chunk_start: chunk_start + actual_size]
+                n_true = int(np.count_nonzero(chunk_data))
+
+                if count + n_true <= key:
+                    count += n_true
+                    continue
+
+                pos_true = chunk_start + int(np.flatnonzero(chunk_data)[key - count])
+                break
+
+            if pos_true == -1:
+                raise IndexError("Unexpected error finding physical index.")
+
+            return self._raw_col[int(pos_true)]
+
+        elif isinstance(key, slice):
+            real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
+            lindices = range(*key.indices(len(real_pos)))
+            phys_indices = np.array([real_pos[i] for i in lindices], dtype=np.int64)
+            return self._raw_col[phys_indices]
+
+        elif isinstance(key, (list, tuple, np.ndarray)):
+            real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
+            phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64)
+            return self._raw_col[phys_indices]
+
+        raise TypeError(f"Invalid index type: {type(key)}")
+
+    def __setitem__(self, key: int | slice | list | np.ndarray, value):  # noqa: C901
+        if isinstance(key, int):
+            n_rows = len(self)
+            if key < 0:
+                key += n_rows
+            if not (0 <= key < n_rows):
+                raise IndexError(f"index {key} is out of bounds for column with size {n_rows}")
+
+            # Same chunk walk as __getitem__: find the physical position of
+            # the key-th valid row.
+            arr = self._valid_rows
+            count = 0
+            chunk_size = arr.chunks[0]
+            pos_true = -1
+
+            for info in arr.iterchunks_info():
+                actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+                chunk_start = info.nchunk * chunk_size
+
+                if info.special == blosc2.SpecialValue.ZERO:
+                    continue
+
+                if info.special == blosc2.SpecialValue.VALUE:
+                    val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                    if not val:
+                        continue
+                    if count + actual_size <= key:
+                        count += actual_size
+                        continue
+                    pos_true = chunk_start + (key - count)
+                    break
+
+                chunk_data = arr[chunk_start: chunk_start + actual_size]
+                n_true = int(np.count_nonzero(chunk_data))
+                if count + n_true <= key:
+                    count += n_true
+                    continue
+
+                pos_true = chunk_start + int(np.flatnonzero(chunk_data)[key - count])
+                break
+
+            if pos_true == -1:
+                raise IndexError("Unexpected error finding physical index.")
+
+            self._raw_col[int(pos_true)] = value
+
+        elif isinstance(key, slice):
+            real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
+            lindices = range(*key.indices(len(real_pos)))
+            phys_indices = np.array([real_pos[i] for i in lindices], dtype=np.int64)
+
+            if isinstance(value, (list, tuple)):
+                value = np.array(value, dtype=self._raw_col.dtype)
+
+            self._raw_col[phys_indices] = value
+
+        elif isinstance(key, (list, tuple, np.ndarray)):
+            real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
+            phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64)
+
+            if isinstance(value, (list, tuple)):
+                value = np.array(value, dtype=self._raw_col.dtype)
+
+            self._raw_col[phys_indices] = value
+        else:
+            raise TypeError(f"Invalid index type: {type(key)}")
+
+    def __iter__(self):
+        arr = self._valid_rows
+        chunk_size = arr.chunks[0]
+
+        for info in arr.iterchunks_info():
+            actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+            chunk_start = info.nchunk * chunk_size
+
+            # All-False chunk: nothing to yield.
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue
+
+            if info.special == blosc2.SpecialValue.VALUE:
+                val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                if not val:
+                    continue
+
+                # All-True chunk: yield the whole data chunk.
+                data_chunk = self._raw_col[chunk_start: chunk_start + actual_size]
+                yield from data_chunk
+                continue
+
+            mask_chunk = arr[chunk_start: chunk_start + actual_size]
+            true_offsets = np.flatnonzero(mask_chunk)
+
+            if len(true_offsets) == 0:
+                continue
+
+            physical_indices = chunk_start + true_offsets
+            valid_data = self._raw_col[physical_indices.tolist()]
+
+            yield from valid_data
+
+    def __len__(self):
+        return blosc2.count_nonzero(self._valid_rows)
+
+    # Comparisons delegate to the raw NDArray and yield blosc2 boolean
+    # expressions, suitable for feeding into CTable.where().
+    def __lt__(self, other):
+        return self._raw_col < other
+
+    def __le__(self, other):
+        return self._raw_col <= other
+
+    def __eq__(self, other):
+        return self._raw_col == other
+
+    def __ne__(self, other):
+        return self._raw_col != other
+
+    def __gt__(self, other):
+        return self._raw_col > other
+
+    def __ge__(self, other):
+        return self._raw_col >= other
+
+    @property
+    def dtype(self):
+        return self._raw_col.dtype
+
+
+class CTable(Generic[RowT]):
+    """Column store whose schema is derived from a Pydantic row model.
+
+    Rows are never moved on deletion; a boolean ``_valid_rows`` mask marks
+    which physical rows are alive, and ``compact()`` rewrites survivors.
+    """
+
+    def __init__(self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False) -> None:  # noqa: C901
+        self._row_type = row_type
+        self._cols: dict[str, blosc2.NDArray] = {}
+        self._n_rows: int = 0
+        self._col_widths: dict[str, int] = {}
+        self.col_names = []
+        self.row = _RowIndexer(self)
+        self.auto_compact = compact
+        self.base = None
+
+        c, b = compute_chunks_blocks((expected_size,))
+        self._valid_rows = blosc2.zeros(shape=(expected_size,), dtype=np.bool_, chunks=c, blocks=b)
+
+        for name, field in row_type.model_fields.items():
+            self.col_names.append(name)
+            origin = getattr(field.annotation, "__origin__", field.annotation)
+
+            if origin is str or field.annotation is str:
+                max_len = 32  # default when no MaxLen annotation is given
+                if hasattr(field.annotation, "__metadata__"):
+                    for meta in field.annotation.__metadata__:
+                        if isinstance(meta, MaxLen):
+                            max_len = meta.max_length
+                            break
+                dt = np.dtype(f"U{max_len}")
+                display_width = max(10, min(max_len, 50))
+
+            elif origin is bytes or field.annotation is bytes:
+                max_len = 32  # default when no MaxLen annotation is given
+                if hasattr(field.annotation, "__metadata__"):
+                    for meta in field.annotation.__metadata__:
+                        if isinstance(meta, MaxLen):
+                            max_len = meta.max_length
+                            break
+                dt = np.dtype(f"S{max_len}")
+                display_width = max(10, min(max_len, 50))
+
+            elif origin is int or field.annotation is int:
+                dt = np.int64
+                display_width = 12
+
+            elif origin is float or field.annotation is float:
+                dt = np.float64
+                display_width = 15
+
+            elif origin is bool or field.annotation is bool:
+                dt = np.bool_
+                display_width = 6  # "True" / "False" fit in 5-6 chars
+
+            elif origin is complex or field.annotation is complex:
+                dt = np.complex128
+                display_width = 25
+            else:
+                dt = np.object_
+                display_width = 20
+
+            # An explicit NumpyDtype annotation overrides the default mapping.
+            if hasattr(field.annotation, "__metadata__"):
+                for meta in field.annotation.__metadata__:
+                    if isinstance(meta, NumpyDtype):
+                        dt = meta.dtype
+                        break
+
+            final_width = max(len(name), display_width)
+            self._col_widths[name] = final_width  # used by __str__
+
+            self._cols[name] = blosc2.zeros(shape=(expected_size,), dtype=dt, chunks=c, blocks=b)
+
+        if new_data is not None:
+            is_append = False
+
+            if isinstance(new_data, (np.void, np.record)):
+                is_append = True
+            elif isinstance(new_data, np.ndarray):
+                if new_data.dtype.names is not None and new_data.ndim == 0:
+                    is_append = True
+            elif isinstance(new_data, list) and len(new_data) > 0:
+                first_elem = new_data[0]
+                if isinstance(first_elem, (str, bytes, int, float, bool, complex)):
+                    is_append = True
+
+            if is_append:
+                self.append(new_data)
+            else:
+                self.extend(new_data)
+
+    def __str__(self):
+        retval = []
+        cont = 0
+
+        # Header with the column names.
+        for name in self._cols:
+            retval.append(f"{name:^{self._col_widths[name]}} |")
+            cont += self._col_widths[name] + 2
+        retval.append("\n")
+        retval.append("-" * cont)
+        retval.append("\n")
+
+        # Rows: only the positions flagged as valid.
+        real_poss = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute()
+        for j in real_poss:
+            for name in self._cols:
+                retval.append(f"{self._cols[name][j]:^{self._col_widths[name]}}")
+                retval.append(" |")
+            retval.append("\n")
+        retval.append("-" * cont)
+        retval.append("\n")
+        return "".join(retval)
+
+    def __len__(self):
+        return self._n_rows
+
+    def view(self, new_valid_rows):
+        """Return a CTable sharing this table's columns but with its own
+        validity mask."""
+        if not (isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr)) and
+                (getattr(new_valid_rows, 'dtype', None) == np.bool_)):
+            raise TypeError(f"Expected boolean blosc2.NDArray or LazyExpr, got {type(new_valid_rows).__name__}")
+
+        new_valid_rows = new_valid_rows.compute() if isinstance(new_valid_rows, blosc2.LazyExpr) else new_valid_rows
+
+        if len(self._valid_rows) != len(new_valid_rows):
+            raise ValueError("new_valid_rows must match the length of the table's validity mask")
+
+        retval = CTable(self._row_type, compact=self.auto_compact, expected_size=len(self._valid_rows))
+        retval._cols = self._cols
+        retval._n_rows = blosc2.count_nonzero(new_valid_rows)
+        retval._col_widths = self._col_widths
+        retval.col_names = self.col_names
+        retval.base = self
+        retval._valid_rows = new_valid_rows
+
+        return retval
+
+    def head(self, N: int = 5) -> CTable:
+        """Return a view with the first N valid rows."""
+        if N <= 0:
+            return self.view(blosc2.zeros(shape=len(self._valid_rows), dtype=np.bool_))
+
+        arr = self._valid_rows
+        count = 0
+        chunk_size = arr.chunks[0]
+        pos_N_true = -1
+
+        for info in arr.iterchunks_info():
+            actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+            chunk_start = info.nchunk * chunk_size
+
+            # All False without decompressing → skip
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue
+
+            # Repeated value → check if True or False
+            if info.special == blosc2.SpecialValue.VALUE:
+                val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                if not val:
+                    continue  # all False, skip
+                # All True: target is at offset (N - count - 1) within the chunk
+                if count + actual_size < N:
+                    count += actual_size
+                    continue
+                pos_N_true = chunk_start + (N - count - 1)
+                break
+
+            # General case: decompress only this chunk
+            chunk_data = arr[chunk_start: chunk_start + actual_size]
+
+            n_true = int(np.count_nonzero(chunk_data))
+            if count + n_true < N:
+                count += n_true
+                continue
+
+            # The N-th True is in this chunk
+            pos_N_true = chunk_start + int(np.flatnonzero(chunk_data)[N - count - 1])
+            break
+
+        if pos_N_true == -1:
+            return self.view(self._valid_rows)
+
+        # Build the mask from whichever side needs fewer writes.
+        if pos_N_true < len(self._valid_rows) // 2:
+            mask_arr = blosc2.zeros(shape=len(arr), dtype=np.bool_)
+            mask_arr[:pos_N_true + 1] = True
+        else:
+            mask_arr = blosc2.ones(shape=len(arr), dtype=np.bool_)
+            mask_arr[pos_N_true + 1:] = False
+
+        mask_arr = (mask_arr & self._valid_rows).compute()
+        return self.view(mask_arr)
+
+    def tail(self, N: int = 5) -> CTable:
+        """Return a view with the last N valid rows."""
+        if N <= 0:
+            # If N is 0 or negative, return an empty table
+            return self.view(blosc2.zeros(shape=len(self._valid_rows), dtype=np.bool_))
+
+        arr = self._valid_rows
+        count = 0
+        chunk_size = arr.chunks[0]
+        pos_N_true = -1
+
+        # Convert to list to iterate chunks in reverse order (metadata only, ~0 memory)
+        for info in reversed(list(arr.iterchunks_info())):
+            actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+            chunk_start = info.nchunk * chunk_size
+
+            # All False without decompressing → skip
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue
+
+            # Repeated value → check if True or False
+            if info.special == blosc2.SpecialValue.VALUE:
+                val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                if not val:
+                    continue  # all False, skip
+
+                # All True: target is at offset 'actual_size - (N - count)' from chunk start
+                if count + actual_size < N:
+                    count += actual_size
+                    continue
+                pos_N_true = chunk_start + actual_size - (N - count)
+                break
+
+            # General case: decompress only this chunk
+            chunk_data = arr[chunk_start: chunk_start + actual_size]
+
+            n_true = int(np.count_nonzero(chunk_data))
+            if count + n_true < N:
+                count += n_true
+                continue
+
+            # The N-th True from the end is in this chunk
+            # We use negative indexing [-(N - count)] to get elements from the back
+            pos_N_true = chunk_start + int(np.flatnonzero(chunk_data)[-(N - count)])
+            break
+
+        if pos_N_true == -1:
+            return self.view(self._valid_rows)
+
+        # Mask creation logic reversed: keep everything from pos_N_true to the end
+        if pos_N_true > len(arr) // 2:
+            # We keep a small tail (less than half the array): start with zeros
+            mask_arr = blosc2.zeros(shape=len(arr), dtype=np.bool_)
+            mask_arr[pos_N_true:] = True
+        else:
+            # We keep a large tail (more than half the array): start with ones
+            mask_arr = blosc2.ones(shape=len(arr), dtype=np.bool_)
+            if pos_N_true > 0:
+                mask_arr[:pos_N_true] = False
+
+        # Compute intersection with existing valid rows and create the view
+        mask_arr = (mask_arr & self._valid_rows).compute()
+        return self.view(mask_arr)
+
+    def __getitem__(self, s: str):
+        if s in self._cols:
+            return Column(self, s)
+        raise KeyError(f"Unknown column: {s!r}")
+
+    def __getattr__(self, s: str):
+        if s in self._cols:
+            return Column(self, s)
+        return super().__getattribute__(s)
+
+    def compact(self):
+        """Rewrite surviving rows contiguously at the front, block by block."""
+        real_poss = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute()
+        start = 0
+        block_size = self._valid_rows.blocks[0]
+        end = min(block_size, self._n_rows)
+        while start < end:
+            for _, v in self._cols.items():
+                v[start:end] = v[real_poss[start:end]]
+            start += block_size
+            end = min(end + block_size, self._n_rows)
+
+        self._valid_rows[:self._n_rows] = True
+        self._valid_rows[self._n_rows:] = False
+
+    @property
+    def nrows(self) -> int:
+        return self._n_rows
+
+    @property
+    def ncols(self) -> int:
+        return len(self._cols)
+
+    def info(self) -> None:
+        """
+        Print a concise summary of the CTable, including the column names,
+        their data types, and memory layout.
+        """
+        n_cols = len(self._cols)
+        n_rows = len(self)
+
+        # Global memory usage across all columns plus the validity mask.
+        cbytes = sum(col.cbytes for col in self._cols.values()) + self._valid_rows.cbytes
+        nbytes = sum(col.nbytes for col in self._cols.values()) + self._valid_rows.nbytes
+
+        def format_bytes(bytes_size: float) -> str:
+            if bytes_size < 1024:
+                return f"{bytes_size} B"
+            elif bytes_size < 1024 ** 2:
+                return f"{bytes_size / 1024:.2f} KB"
+            elif bytes_size < 1024 ** 3:
+                return f"{bytes_size / (1024 ** 2):.2f} MB"
+            else:
+                return f"{bytes_size / (1024 ** 3):.2f} GB"
+
+        ratio = (nbytes / cbytes) if cbytes > 0 else 0.0
+
+        lines = []
+        lines.append("")
+        lines.append(f"nºColumns: {n_cols}")
+        lines.append(f"nºRows: {n_rows}")
+        lines.append("")
+
+        header = f" {'#':>3} {'Column':<15} {'Itemsize':<12} {'Dtype':<15}"
+        lines.append(header)
+        lines.append(f" {'---':>3} {'------':<15} {'--------':<12} {'-----':<15}")
+
+        for i, name in enumerate(self.col_names):
+            col_array = self._cols[name]
+            dtype_str = str(col_array.dtype)
+            itemsize = f"{col_array.dtype.itemsize} B"
+
+            line = f" {i:>3} {name:<15} {itemsize:<12} {dtype_str:<15}"
+            lines.append(line)
+
+        lines.append("")
+        lines.append(f"memory usage: {format_bytes(cbytes)}")
+        lines.append(f"uncompressed size: {format_bytes(nbytes)}")
+        lines.append(f"compression ratio: {ratio:.2f}x")
+        lines.append("")
+
+        print("\n".join(lines))
+
+    def append(self, data: list | np.void | np.ndarray) -> None:  # noqa: C901
+        if self.base is not None:
+            raise TypeError("Cannot append to a view.")
+
+        is_list = isinstance(data, (list, tuple))
+        col_values = list(self._cols.values())
+        col_names = self.col_names
+
+        if isinstance(data, dict):
+            raise TypeError("Dictionaries are not supported in append.")
+
+        if is_list and len(data) != len(col_values):
+            raise ValueError(f"Expected {len(col_values)} values, received {len(data)}")
+
+        # Validate every value against its column dtype before touching storage.
+        if is_list:
+            for i, val in enumerate(data):
+                target_dtype = col_values[i].dtype
+                try:
+                    np.array(val, dtype=target_dtype)
+                except (ValueError, TypeError):
+                    raise TypeError(
+                        f"Value '{val}' is not compatible with column '{col_names[i]}' of type {target_dtype}") from None
+        else:
+            for name, arr in self._cols.items():
+                try:
+                    val = data[name]
+                except (IndexError, KeyError, ValueError):
+                    raise ValueError(f"Input data does not contain required field '{name}'") from None
+                try:
+                    np.array(val, dtype=arr.dtype)
+                except (ValueError, TypeError):
+                    raise TypeError(f"Value '{val}' in field '{name}' is not compatible with type {arr.dtype}") from None
+
+        # Write right after the last valid row, doubling capacity if needed.
+        ultimas_validas = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute()
+        pos = ultimas_validas[-1] + 1 if len(ultimas_validas) > 0 else 0
+        if pos >= len(self._valid_rows):
+            c = len(self._valid_rows)
+            for _, v in self._cols.items():
+                v.resize((c * 2,))
+            self._valid_rows.resize((c * 2,))
+
+        if is_list:
+            for i, col_array in enumerate(col_values):
+                col_array[pos] = data[i]
+        else:
+            for name, col_array in self._cols.items():
+                col_array[pos] = data[name]
+        self._valid_rows[pos] = True
+
+        self._n_rows += 1
+
+    def delete(self, ind: int | slice | str | Iterable):
+        """Logically delete rows: clear their bits in the validity mask."""
+        valid_rows_np = self._valid_rows[:]
+        true_pos = np.where(valid_rows_np)[0]
+
+        if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)):
+            ind = list(ind)
+        elif not isinstance(ind, int) and not isinstance(ind, slice):
+            raise TypeError(f"Invalid type '{type(ind)}'")
+
+        false_pos = true_pos[ind]
+
+        new_mask_np = valid_rows_np.copy()
+        new_mask_np[false_pos] = False
+
+        new_mask = blosc2.asarray(new_mask_np)
+        self._valid_rows = new_mask
+        self._n_rows = blosc2.count_nonzero(self._valid_rows)
+
+    def extend(self, data: list | CTable | Any) -> None:
+        if self.base is not None:
+            raise TypeError("Cannot extend a view.")
+        if len(data) <= 0:
+            return
+
+        ultimas_validas = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute()
+        start_pos = ultimas_validas[-1] + 1 if len(ultimas_validas) > 0 else 0
+
+        current_col_names = self.col_names
+        columns_to_insert = []
+        new_nrows = 0
+
+        # Normalize the input into per-column sequences.
+        if hasattr(data, "_cols") and hasattr(data, "_n_rows"):
+            # Another CTable.
+            for name in current_col_names:
+                col = data._cols[name][:data._n_rows]
+                columns_to_insert.append(col)
+            new_nrows = data._n_rows
+        else:
+            if isinstance(data, np.ndarray) and data.dtype.names is not None:
+                # Structured NumPy array.
+                for name in current_col_names:
+                    columns_to_insert.append(data[name])
+                new_nrows = len(data)
+            else:
+                # Sequence of row tuples/lists: transpose into columns.
+                columns_to_insert = list(zip(*data, strict=True))
+                new_nrows = len(data)
+
+        processed_cols = []
+        for i, raw_col in enumerate(columns_to_insert):
+            target_dtype = self._cols[current_col_names[i]].dtype
+            b2_arr = blosc2.asarray(raw_col, dtype=target_dtype)
+            processed_cols.append(b2_arr)
+
+        end_pos = start_pos + new_nrows
+
+        if self.auto_compact and end_pos >= len(self._valid_rows):
+            self.compact()
+            ultimas_validas = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute()
+            start_pos = ultimas_validas[-1] + 1 if len(ultimas_validas) > 0 else 0
+            end_pos = start_pos + new_nrows
+
+        while end_pos > len(self._valid_rows):
+            c = len(self._valid_rows)
+            for name in current_col_names:
+                self._cols[name].resize((c * 2,))
+            self._valid_rows.resize((c * 2,))
+
+        # TODO: do this per chunks
+        for j, name in enumerate(current_col_names):
+            self._cols[name][start_pos:end_pos] = processed_cols[j][:]
+
+        self._valid_rows[start_pos:end_pos] = True
+        self._n_rows = blosc2.count_nonzero(self._valid_rows)
+
+    @profile
+    def where(self, expr_result) -> CTable:
+        """Return a view keeping only the valid rows where expr_result is True."""
+        if not (isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr)) and
+                (getattr(expr_result, 'dtype', None) == np.bool_)):
+            raise TypeError(f"Expected boolean blosc2.NDArray or LazyExpr, got {type(expr_result).__name__}")
+
+        filter = expr_result.compute() if isinstance(expr_result, blosc2.LazyExpr) else expr_result
+
+        target_len = len(self._valid_rows)
+
+        # Align the filter length with the physical capacity of the table.
+        if len(filter) > target_len:
+            filter = filter[:target_len]
+        elif len(filter) < target_len:
+            padding = blosc2.zeros(target_len, dtype=np.bool_)
+            padding[:len(filter)] = filter[:]
+            filter = padding
+
+        filter = (filter & self._valid_rows).compute()
+
+        return self.view(filter)
+
+    def _run_row_logic(self, ind: int | slice | str | Iterable) -> CTable:
+        valid_rows_np = self._valid_rows[:]
+        true_pos = np.where(valid_rows_np)[0]
+
+        if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)):
+            ind = list(ind)
+
+        mant_pos = true_pos[ind]
+
+        new_mask_np = np.zeros_like(valid_rows_np, dtype=bool)
+        new_mask_np[mant_pos] = True
+
+        new_mask = blosc2.asarray(new_mask_np)
+        return self.view(new_mask)
+
+    # Save & load are not implemented yet.
+
+    def save(self, urlpath: str, group: str = "table") -> None:
+        ...
+
+    @classmethod
+    def load(cls, urlpath: str, group: str = "table", row_type: type[RowT] | None = None) -> CTable:
+        ...
\ No newline at end of file
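The `where()` path composes with the view machinery: filtering masks rows without copying column data. A sketch of the intended use, assuming that Column comparisons yield boolean blosc2 expressions as `where()` checks for (the model is illustrative):

    from pydantic import BaseModel, Field
    from blosc2 import CTable

    class Row(BaseModel):
        id: int = Field(ge=0)
        score: float

    t = CTable(Row, new_data=[(i, float(i)) for i in range(10)])
    v = t.where(t.score > 4.0)  # boolean expression -> filtered view
    assert len(v) == 5          # rows with score 5.0 .. 9.0
    assert v._cols is t._cols   # the view shares column storage
    # v.append(...) or v.extend(...) raise TypeError: views cannot grow
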
diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py
new file mode 100644
index 00000000..8cbde248
--- /dev/null
+++ b/tests/ctable/test_column.py
@@ -0,0 +1,390 @@
+from typing import Annotated
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+def test_column_dtype():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+    col_score = tabla.score
+    col_active = tabla.active
+
+    assert col_id.dtype == np.int64
+    assert col_score.dtype == np.float64
+    assert col_active.dtype == np.bool_
+
+
+def test_column_references():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    assert col_id._raw_col is tabla._cols["id"]
+    assert col_id._valid_rows is tabla._valid_rows
+
+
+def test_column_getitem_int_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+    assert col_id[0] == 0
+    assert col_id[5] == 5
+    assert col_id[19] == 19
+    assert col_id[-1] == 19
+    assert col_id[-5] == 15
+
+
+def test_column_getitem_int_with_holes():
+    data = [(i, float(i * 10), i % 2 == 0) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    assert col_id[0] == 0
+    assert col_id[1] == 2
+    assert col_id[2] == 4
+    assert col_id[3] == 6
+    assert col_id[4] == 8
+    assert col_id[-1] == 19
+    assert col_id[-2] == 18
+
+
+def test_column_getitem_slice_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    result = col_id[0:5]
+    expected = [0, 1, 2, 3, 4]
+    assert list(result) == expected
+
+    result = col_id[10:15]
+    expected = [10, 11, 12, 13, 14]
+    assert list(result) == expected
+
+    result = col_id[::2]
+    expected = list(range(0, 20, 2))
+    assert list(result) == expected
+
+
+def test_column_getitem_slice_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
+
+    col_id = tabla.id
+
+    result = col_id[0:5]
+    expected = [0, 2, 4, 6, 8]
+    assert list(result) == expected
+
+    result = col_id[5:10]
+    expected = [10, 12, 14, 16, 18]
+    assert list(result) == expected
+
+    result = col_id[::2]
+    expected = [0, 4, 8, 12, 16]
+    assert list(result) == expected
+
+
+def test_column_getitem_list_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    result = col_id[[0, 5, 10, 15]]
+    expected = [0, 5, 10, 15]
+    assert list(result) == expected
+
+    result = col_id[[19, 0, 10]]
+    expected = [19, 0, 10]
+    assert list(result) == expected
+
+
+def test_column_getitem_list_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    result = col_id[[0, 2, 4]]
+    expected = [0, 4, 8]
+    assert list(result) == expected
+
+    result = col_id[[5, 3, 1]]
+    expected = [10, 6, 2]
+    assert list(result) == expected
+
+
+def test_column_getitem_out_of_range_int():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    with pytest.raises(IndexError):
+        _ = col_id[100]
+
+    with pytest.raises(IndexError):
+        _ = col_id[-100]
+
+
+def test_column_getitem_out_of_range_list():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    with pytest.raises(IndexError):
+        _ = col_id[[0, 1, 100]]
+
+
+def test_column_setitem_int_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    col_id[0] = 999
+    assert col_id[0] == 999
+
+    col_id[10] = 888
+    assert col_id[10] == 888
+
+    col_id[-1] = 777
+    assert col_id[-1] == 777
+
+
+def test_column_setitem_int_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    col_id[0] = 999
+    assert col_id[0] == 999
+    assert tabla._cols["id"][0] == 999
+
+    col_id[2] = 888
+    assert col_id[2] == 888
+    assert tabla._cols["id"][4] == 888
+
+    col_id[-1] = 777
+    assert col_id[-1] == 777
+
+
+def test_column_setitem_slice_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    col_id[0:5] = [100, 101, 102, 103, 104]
+
+    assert col_id[0] == 100
+    assert col_id[1] == 101
+    assert col_id[2] == 102
+    assert col_id[3] == 103
+    assert col_id[4] == 104
+
+
+def test_column_setitem_slice_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    col_id[0:3] = [100, 200, 300]
+
+    assert col_id[0] == 100
+    assert col_id[1] == 200
+    assert col_id[2] == 300
+
+
+def test_column_setitem_list_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    col_id[[0, 5, 10]] = [100, 500, 1000]
+
+    assert col_id[0] == 100
+    assert col_id[5] == 500
+    assert col_id[10] == 1000
+
+
+def test_column_setitem_list_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    col_id[[0, 2, 4]] = [100, 200, 300]
+
+    assert col_id[0] == 100
+    assert col_id[2] == 200
+    assert col_id[4] == 300
+
+
+def test_column_iter_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    result = list(col_id)
+    expected = list(range(20))
+
+    assert result == expected
+
+
+def test_column_iter_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
+
+    col_id = tabla.id
+
+    result = list(col_id)
+    expected = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
+
+    assert result == expected
+
+
+def test_column_iter_score():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([0, 5, 10, 15])
+
+    col_score = tabla.score
+
+    result = list(col_score)
+    expected = [10.0, 20.0, 30.0, 40.0, 60.0, 70.0, 80.0, 90.0,
+                110.0, 120.0, 130.0, 140.0, 160.0, 170.0, 180.0, 190.0]
+
+    assert result == expected
+
+
+def test_column_len_no_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    assert len(col_id) == 20
+
+
+def test_column_len_with_holes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    col_id = tabla.id
+
+    assert len(col_id) == 15
+
+
+def test_column_len_after_multiple_deletes():
+    data = [(i, float(i * 10), True) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    col_id = tabla.id
+
+    assert len(col_id) == 20
+
+    tabla.delete([0, 1, 2])
+    assert len(col_id) == 17
+
+    tabla.delete([0, 1, 2, 3, 4])
+    assert len(col_id) == 12
+
+
+def test_column_multiple_columns_consistency():
+    data = [(i, float(i * 10), i % 2 == 0) for i in range(20)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([2, 5, 8, 11, 14])
+
+    col_id = tabla.id
+    col_score = tabla.score
+    col_active = tabla.active
+
+    assert len(col_id) == len(col_score) == len(col_active) == 15
+
+    for i in range(len(col_id)):
+        expected_id = col_id[i]
+        expected_score = col_score[i]
+
+        assert expected_score == float(expected_id * 10)
+
+
+def test_column_empty_table():
+    tabla = CTable(RowModel)
+
+    col_id = tabla.id
+
+    assert len(col_id) == 0
+
+    result = list(col_id)
+    assert result == []
+
+
+def test_column_all_deleted():
+    data = [(i, float(i * 10), True) for i in range(10)]
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete(list(range(10)))
+
+    col_id = tabla.id
+
+    assert len(col_id) == 0
+
+    result = list(col_id)
+    assert result == []
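The logical-index mapping these tests exercise, in a few lines (illustrative two-column model):

    from pydantic import BaseModel, Field
    from blosc2 import CTable

    class Row(BaseModel):
        id: int = Field(ge=0)
        score: float

    t = CTable(Row, new_data=[(i, float(i * 10)) for i in range(6)])
    t.delete([1, 3])               # physical rows 1 and 3 are masked out
    col = t.id
    assert list(col) == [0, 2, 4, 5]  # iteration skips the holes
    assert col[1] == 2                # logical index 1 maps to physical row 2
    col[1] = 99                       # writes through to that physical row
    assert t._cols["id"][2] == 99
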
diff --git a/tests/ctable/test_compact.py b/tests/ctable/test_compact.py
new file mode 100644
index 00000000..30d0dd53
--- /dev/null
+++ b/tests/ctable/test_compact.py
@@ -0,0 +1,151 @@
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+
+# --- Basic model setup for tests ---
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+
+
+def generate_test_data(n_rows: int) -> list:
+    return [(i, float(i)) for i in range(n_rows)]
+
+
+def test_compact_empty_table():
+    """Test compact() on a completely empty table (no data)."""
+    table = CTable(RowModel, expected_size=100)
+
+    assert len(table) == 0
+
+    # Should not raise any error
+    table.compact()
+
+    # The logical table must remain empty
+    assert len(table) == 0
+    # Verify that if data is added later, it works correctly
+    table.append((1, 10.0))
+    assert len(table) == 1
+    assert table.id[0] == 1
+
+
+def test_compact_full_table():
+    """Test compact() on a completely full table (no holes or free space)."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    assert len(table) == 50
+    initial_capacity = len(table._valid_rows)
+
+    # Should not raise any error or change the logical state
+    table.compact()
+
+    assert len(table) == 50
+    # Capacity should not have changed because it was already full
+    assert len(table._valid_rows) == initial_capacity
+
+    # Verify data integrity
+    assert table.id[0] == 0
+    assert table.id[-1] == 49
+
+
+def test_compact_already_compacted_table():
+    """Test compact() on a table that has free space but no holes (contiguous data)."""
+    data = generate_test_data(20)
+    # Large expected_size to ensure free space at the end
+    table = CTable(RowModel, new_data=data, expected_size=100)
+
+    assert len(table) == 20
+
+    # Execute compact. Data is already contiguous, so this must be a
+    # logical no-op and must not fail.
+    table.compact()
+
+    assert len(table) == 20
+
+    # Verify that data remains in place
+    for i in range(20):
+        assert table.id[i] == i
+
+    # Validate that all True values are consecutive at the beginning
+    mask = table._valid_rows[:len(table._valid_rows)]
+    assert np.all(mask[:20])
+    if len(mask) > 20:
+        assert np.all(~mask[20:])
+
+
+def test_compact_with_holes():
+    """Test compact() on a table with high fragmentation (holes)."""
+    data = generate_test_data(30)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete sparsely: leave only [0, 5, 10, 15, 20, 25]
+    to_delete = [i for i in range(30) if i % 5 != 0]
+    table.delete(to_delete)
+
+    assert len(table) == 6
+
+    # Execute compact
+    table.compact()
+
+    assert len(table) == 6
+
+    # Verify that the correct data survived and moved to the beginning
+    expected_ids = [0, 5, 10, 15, 20, 25]
+    for i, exp_id in enumerate(expected_ids):
+        # Through the logical view (Column wrapper)
+        assert table.id[i] == exp_id
+        # Through the physical blosc2 array (to ensure compact worked)
+        assert table._cols["id"][i] == exp_id
+
+    # Verify physical mask: first 6 must be True, the rest False
+    mask = table._valid_rows[:len(table._valid_rows)]
+    assert np.all(mask[:6])
+    if len(mask) > 6:
+        assert np.all(~mask[6:])
+
+
+def test_compact_all_deleted():
+    """Test compact() on a table where absolutely all rows have been deleted."""
+    data = generate_test_data(20)
+    table = CTable(RowModel, new_data=data, expected_size=20)
+
+    # Delete everything
+    table.delete(list(range(20)))
+    assert len(table) == 0
+
+    # Should handle empty arrays correctly
+    table.compact()
+
+    assert len(table) == 0
+
+    # Check that we can write to it again
+    table.append((99, 99.0))
+    assert len(table) == 1
+    assert table.id[0] == 99
+
+
+def test_compact_multiple_times():
+    """Calling compact() multiple times in a row must not corrupt data or crash."""
+    data = generate_test_data(10)
+    table = CTable(RowModel, new_data=data, expected_size=20)
+
+    table.delete([1, 3, 5, 7, 9])  # 5 elements remaining
+
+    # Compact 3 times in a row
+    table.compact()
+    table.compact()
+    table.compact()
+
+    assert len(table) == 5
+    assert list(table.id) == [0, 2, 4, 6, 8]
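compact() is the physical counterpart of the logical delete(): it rewrites the survivors to the front so the validity mask becomes a plain prefix. Sketch (illustrative model):

    from pydantic import BaseModel, Field
    from blosc2 import CTable

    class Row(BaseModel):
        id: int = Field(ge=0)
        score: float

    t = CTable(Row, new_data=[(i, float(i)) for i in range(6)])
    t.delete([0, 2, 4])
    t.compact()  # rows [1, 3, 5] move to physical positions 0..2
    assert list(t.id) == [1, 3, 5]
    assert bool(t._valid_rows[0]) and not bool(t._valid_rows[3])
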
diff --git a/tests/ctable/test_construct.py b/tests/ctable/test_construct.py
new file mode 100644
index 00000000..6b9599e1
--- /dev/null
+++ b/tests/ctable/test_construct.py
@@ -0,0 +1,113 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated, TypeVar
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+# -------------------------------------------------------------------
+# 1. Row Type Definition for Testing
+# -------------------------------------------------------------------
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+# -------------------------------------------------------------------
+# 2. Predefined Test Data (just the minimum)
+# -------------------------------------------------------------------
+SMALL_DATA = [
+    (1, 1 + 2j, 95.5, True),
+    (2, 3 - 4j, 80.0, False),
+    (3, 0j, 50.2, True),
+]
+
+dtype_struct = [('id', 'i8'), ('c_val', 'c16'), ('score', 'f8'), ('active', '?')]
+SMALL_STRUCT = np.array(SMALL_DATA, dtype=dtype_struct)
+
+
+# -------------------------------------------------------------------
+# 3. The extend() tests
+# -------------------------------------------------------------------
+
+def test_extend_from_list():
+    """Extend with a list of tuples."""
+    table = CTable(RowModel)
+    table.extend(SMALL_DATA)
+
+    assert len(table) == 3
+    assert table.id[0] == 1
+    assert table.id[2] == 3
+
+
+def test_extend_from_struct():
+    """Extend with a structured array."""
+    table = CTable(RowModel)
+    table.extend(SMALL_STRUCT)
+
+    assert len(table) == 3
+    assert table.id[0] == 1
+    assert table.score[1] == 80.0
+
+
+def test_extend_from_another_ctable():
+    """Extend with another CTable."""
+    base_table = CTable(RowModel, new_data=SMALL_DATA)
+    new_table = CTable(RowModel)
+    new_table.extend(base_table)
+    assert len(new_table) == 3
+
+
+def test_extend_empty_list():
+    """Extend with an empty list must not break."""
+    table = CTable(RowModel)
+    table.extend([])
+    assert len(table) == 0
+
+
+def test_extend_multiple_times():
+    """Multiple consecutive extends."""
+    table = CTable(RowModel)
+    table.extend(SMALL_DATA[:2])
+    table.extend(SMALL_DATA[2:])
+    assert len(table) == 3
+
+
+def test_extend_with_auto_resize():
+    """An extend that forces an auto-resize."""
+    table = CTable(RowModel, expected_size=1)
+    table.extend(SMALL_DATA)
+    assert len(table) == 3
+
+
+def test_extend_invalid_length():
+    """Extend with the wrong number of fields."""
+    table = CTable(RowModel)
+    with pytest.raises(IndexError):
+        table.extend([(1, 2 + 3j)])  # missing fields
+
+
+def test_extend_invalid_type():
+    """Extend with an incompatible type."""
+    table = CTable(RowModel)
+    with pytest.raises((TypeError, ValueError)):
+        table.extend([(1, "texto", 50.0, True)])
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
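extend() doubles the physical capacity whenever an insert would overflow it, which is what test_extend_with_auto_resize above relies on. Spelled out (illustrative model):

    from pydantic import BaseModel, Field
    from blosc2 import CTable

    class Row(BaseModel):
        id: int = Field(ge=0)
        score: float

    t = CTable(Row, expected_size=1)  # deliberately tiny capacity
    t.extend([(i, float(i)) for i in range(5)])
    assert len(t) == 5
    assert len(t._valid_rows) >= 5    # physical capacity grew by doubling
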
diff --git a/tests/ctable/test_delete_rows.py b/tests/ctable/test_delete_rows.py
new file mode 100644
index 00000000..25b60387
--- /dev/null
+++ b/tests/ctable/test_delete_rows.py
@@ -0,0 +1,529 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated, TypeVar
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+# -------------------------------------------------------------------
+# 1. Row Type Definition for Testing
+# -------------------------------------------------------------------
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+# -------------------------------------------------------------------
+# 2. Test Data Generation
+# -------------------------------------------------------------------
+def generate_test_data(n_rows: int) -> list:
+    """
+    Generate n_rows of test data following the RowModel schema.
+    Returns a list of tuples.
+    """
+    return [
+        (i, complex(i, -i), float((i * 7) % 100), bool(i % 2))
+        for i in range(1, n_rows + 1)
+    ]
+
+
+# -------------------------------------------------------------------
+# 3. Helper Functions
+# -------------------------------------------------------------------
+def get_valid_positions(table: CTable) -> np.ndarray:
+    """
+    Extract the positions where _valid_rows is True.
+    Returns a numpy array of indices.
+    """
+    return np.flatnonzero(table._valid_rows[:len(table._valid_rows)])
+
+
+def assert_valid_rows_match(table: CTable, expected_valid_indices: list):
+    """
+    Check that _valid_rows has True exactly at the expected positions
+    and False everywhere else (up to the table's internal array length).
+
+    Args:
+        table: The CTable instance to check
+        expected_valid_indices: List of indices that should be True
+    """
+    valid_positions = get_valid_positions(table)
+    expected_array = np.array(sorted(expected_valid_indices))
+
+    np.testing.assert_array_equal(
+        valid_positions[:len(expected_array)],
+        expected_array,
+        err_msg=f"Valid rows mismatch. Expected {expected_array}, got {valid_positions}"
+    )
+
+
+# -------------------------------------------------------------------
+# 4. Basic Delete Tests (Single Element)
+# -------------------------------------------------------------------
+
+def test_delete_first_element_once():
+    """Delete the first element (position 0) from a full 50-row table."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Before deletion
+    assert len(table) == 50
+
+    # Delete first element
+    table.delete(0)
+
+    # After deletion
+    assert len(table) == 49
+    # Position 0 should now be False, positions 1-49 should be True
+    expected_valid = list(range(1, 50))
+    assert_valid_rows_match(table, expected_valid)
+
+
+def test_delete_first_element_10_times():
+    """Delete the first element 10 times consecutively using a loop."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    initial_length = 50
+
+    for i in range(10):
+        table.delete(0)
+        expected_length = initial_length - (i + 1)
+        assert len(table) == expected_length, \
+            f"After {i + 1} deletions, expected length {expected_length}, got {len(table)}"
+
+    # After 10 deletions, should have 40 rows
+    assert len(table) == 40
+
+
+def test_delete_last_element_once():
+    """Delete the last element using delete(-1) from a full 50-row table."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Before deletion
+    assert len(table) == 50
+
+    # Delete last element
+    table.delete(-1)
+
+    # After deletion
+    assert len(table) == 49
+
+
+def test_delete_last_element_10_times():
+    """Delete the last element 10 times consecutively using delete(-1)."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    initial_length = 50
+
+    for i in range(10):
+        table.delete(-1)
+        expected_length = initial_length - (i + 1)
+        assert len(table) == expected_length, \
+            f"After {i + 1} deletions, expected length {expected_length}, got {len(table)}"
+
+    # After 10 deletions, should have 40 rows
+    assert len(table) == 40
+
+
+def test_delete_middle_element():
+    """Delete a middle element from a 50-row table."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete position 25 (middle)
+    table.delete(25)
+
+    assert len(table) == 49
+
+
+def test_delete_multiple_individual_elements():
+    """Delete multiple elements one by one."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Each deletion shifts the remaining logical indices, so for
+    # simplicity delete the first element five times.
+    for _ in range(5):
+        table.delete(0)
+
+    assert len(table) == 45
+
+
+# -------------------------------------------------------------------
+# 5. Delete with List of Positions
+# -------------------------------------------------------------------
+
+def test_delete_list_of_positions():
+    """Delete multiple positions at once using a list."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete positions [0, 10, 20, 30, 40]
+    table.delete([0, 10, 20, 30, 40])
+
+    assert len(table) == 45
+
+
+def test_delete_consecutive_positions_list():
+    """Delete consecutive positions using a list."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete positions [5, 6, 7, 8, 9]
+    table.delete([5, 6, 7, 8, 9])
+
+    assert len(table) == 45
+
+
+def test_delete_all_even_positions():
+    """Delete all even-indexed positions."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete all even positions (0, 2, 4, ..., 48)
+    even_positions = list(range(0, 50, 2))
+    table.delete(even_positions)
+
+    assert len(table) == 25
+
+
+def test_delete_all_odd_positions():
+    """Delete all odd-indexed positions."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete all odd positions (1, 3, 5, ..., 49)
+    odd_positions = list(range(1, 50, 2))
+    table.delete(odd_positions)
+
+    assert len(table) == 25
+
+
+# -------------------------------------------------------------------
+# 6. Delete Out-of-Bounds Tests (Should Raise Errors)
+# -------------------------------------------------------------------
+
+def test_delete_position_beyond_length_full_table():
+    """
+    Try to delete position 60 in a full 50-row table.
+    Should raise IndexError.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    with pytest.raises(IndexError):
+        table.delete(60)
+
+
+def test_delete_position_beyond_nrows_partial_table():
+    """
+    Try to delete position 35 in a table with capacity 50 but only 25 rows.
+    Should raise IndexError.
+    """
+    data = generate_test_data(25)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    assert len(table) == 25
+
+    with pytest.raises(IndexError):
+        table.delete(35)
+
+
+def test_delete_from_empty_table_position_25():
+    """
+    Try to delete position 25 from an empty table.
+    Should raise IndexError.
+    """
+    table = CTable(RowModel, expected_size=50)
+
+    assert len(table) == 0
+
+    with pytest.raises(IndexError):
+        table.delete(25)
+
+
+def test_delete_from_empty_table_position_0():
+    """
+    Try to delete position 0 from an empty table.
+    Should raise IndexError.
+    """
+    table = CTable(RowModel, expected_size=50)
+
+    assert len(table) == 0
+
+    with pytest.raises(IndexError):
+        table.delete(0)
+
+
+def test_delete_from_empty_table_position_negative():
+    """
+    Try to delete position -1 from an empty table.
+    Should raise IndexError.
+    """
+    table = CTable(RowModel, expected_size=50)
+
+    assert len(table) == 0
+
+    with pytest.raises(IndexError):
+        table.delete(-1)
+
+
+def test_delete_negative_position_beyond_length():
+    """
+    Try to delete position -60 in a 50-row table.
+    Should raise IndexError.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    with pytest.raises(IndexError):
+        table.delete(-60)
+
+
+# -------------------------------------------------------------------
+# 7. Delete with Slices (if your implementation supports it)
+# -------------------------------------------------------------------
+# NOTE: Based on the current code, delete() accepts int or list[int].
+# If you want to support slices, you'll need to modify the delete method.
+# Below are tests assuming slice support is added.
+
+def test_delete_slice_range_a_to_b():
+    """
+    Delete rows from position a to b (not including b) using slice(a, b).
+    Example: delete positions 10 to 20 (10 rows).
+
+    NOTE: This requires the delete() method to handle slice objects.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # This will only work if slice support is implemented in delete()
+    try:
+        # Delete positions 10-19 (10 rows)
+        positions = list(range(10, 20))
+        table.delete(positions)
+
+        assert len(table) == 40
+    except TypeError:
+        pytest.skip("Slice support not yet implemented in delete()")
+
+
+def test_delete_slice_with_step():
+    """
+    Delete rows using slice with step: a:b:c
+    Example: delete every other row from 0 to 20.
+
+    NOTE: This requires the delete() method to handle slice objects.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    try:
+        # Delete positions 0, 2, 4, ..., 18 (every other row from 0 to 20)
+        positions = list(range(0, 20, 2))
+        table.delete(positions)
+
+        assert len(table) == 40
+    except TypeError:
+        pytest.skip("Slice support not yet implemented in delete()")
+
+
+def test_delete_slice_from_start():
+    """
+    Delete rows from start to position b using slice(:b).
+    Example: delete first 10 rows.
+
+    NOTE: This requires the delete() method to handle slice objects.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    try:
+        # Delete positions 0-9 (first 10 rows)
+        positions = list(range(0, 10))
+        table.delete(positions)
+
+        assert len(table) == 40
+    except TypeError:
+        pytest.skip("Slice support not yet implemented in delete()")
+
+
+def test_delete_slice_to_end():
+    """
+    Delete rows from position a to end using slice(a:).
+    Example: delete last 10 rows.
+
+    NOTE: This requires the delete() method to handle slice objects.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    try:
+        # Delete positions 40-49 (last 10 rows)
+        positions = list(range(40, 50))
+        table.delete(positions)
+
+        assert len(table) == 40
+    except TypeError:
+        pytest.skip("Slice support not yet implemented in delete()")
+
+
+# -------------------------------------------------------------------
+# 8. Edge Cases and Special Scenarios
+# -------------------------------------------------------------------
+
+# -------------------------------------------------------------------
+# 8. Edge Cases and Special Scenarios
+# -------------------------------------------------------------------
+
+def test_delete_same_position_twice():
+    """
+    Delete the same logical position twice.
+    Positions re-index after each deletion, so the second call removes the
+    row that shifted into position 10 (formerly position 11).
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete position 10
+    table.delete(10)
+    assert len(table) == 49
+
+    # Delete what is now position 10 (was position 11 before)
+    table.delete(10)
+    assert len(table) == 48
+
+
+def test_delete_all_rows_one_by_one():
+    """Delete all 50 rows one by one from the front."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    for _ in range(50):
+        table.delete(0)
+
+    assert len(table) == 0
+
+
+def test_delete_all_rows_from_back():
+    """Delete all 50 rows one by one from the back using -1."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    for _ in range(50):
+        table.delete(-1)
+
+    assert len(table) == 0
+
+
+def test_delete_with_negative_indices():
+    """Delete using various negative indices."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete positions -1, -5, -10 (last, 5th from last, 10th from last)
+    table.delete([-1, -5, -10])
+
+    assert len(table) == 47
+
+
+def test_delete_mixed_positive_negative_indices():
+    """Delete using a mix of positive and negative indices."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete positions [0, -1, 25] (first, last, middle)
+    table.delete([0, -1, 25])
+
+    assert len(table) == 47
+
+
+# -------------------------------------------------------------------
+# 9. Type Validation Tests
+# -------------------------------------------------------------------
+
+def test_delete_invalid_type_string():
+    """Try to delete with a string (invalid type). Should raise TypeError."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    with pytest.raises(TypeError):
+        table.delete("invalid")
+
+
+def test_delete_invalid_type_float():
+    """Try to delete with a float (invalid type). Should raise TypeError."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    with pytest.raises(TypeError):
+        table.delete(10.5)
+
+
+def test_delete_invalid_list_with_strings():
+    """
+    Try to delete with a list containing strings.
+    Should raise TypeError or IndexError, depending on how the
+    implementation validates list elements.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    with pytest.raises((TypeError, IndexError)):
+        table.delete([0, "invalid", 10])
+
+
+# -------------------------------------------------------------------
+# 10. Stress Tests
+# -------------------------------------------------------------------
+
+def test_delete_large_number_of_positions():
+    """Delete a large number of positions at once."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete 40 out of 50 positions
+    positions_to_delete = list(range(0, 40))
+    table.delete(positions_to_delete)
+
+    assert len(table) == 10
+
+
+def test_delete_alternate_pattern():
+    """
+    Delete alternating rows multiple times to test
+    the _valid_rows tracking under complex patterns.
+    """
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # First pass: delete every other row (even indices)
+    even_positions = list(range(0, 50, 2))
+    table.delete(even_positions)
+    assert len(table) == 25
+
+    # Second pass: delete every other remaining row
+    # (which are at logical positions 0, 2, 4, ... in the new 25-row table)
+    new_even = list(range(0, 25, 2))
+    table.delete(new_even)
+    assert len(table) == 12  # 25 - 13: range(0, 25, 2) has 13 positions
+
+
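+# The suite above assumes delete() flips bits in a validity mask instead of
+# physically moving column data.  A pure-numpy sketch of that bookkeeping
+# (illustrative only; the real _valid_rows handling may differ):
+def _mask_delete_sketch(mask, logical_positions):
+    """Clear the physical slots that back the given logical rows."""
+    import numpy as np
+
+    mask = np.asarray(mask, dtype=bool).copy()
+    physical = np.flatnonzero(mask)      # physical slot of each live row
+    mask[physical[logical_positions]] = False
+    return mask
+
+# e.g. _mask_delete_sketch([True, False, True, True, True], [0, 2])
+# -> [False, False, True, False, True]
+
+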
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/tests/ctable/test_extend_delete.py b/tests/ctable/test_extend_delete.py
new file mode 100644
index 00000000..02f3feea
--- /dev/null
+++ b/tests/ctable/test_extend_delete.py
@@ -0,0 +1,330 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+def generate_test_data(n_rows: int, start_id: int = 1) -> list:
+    return [
+        (start_id + i, complex(i, -i), float((i * 7) % 100), bool(i % 2))
+        for i in range(n_rows)
+    ]
+
+
+def get_valid_mask(table: CTable) -> np.ndarray:
+    return np.array(table._valid_rows[:], dtype=bool)
+
+
+def get_column_values(table: CTable, col_name: str, length: int) -> np.ndarray:
+    return np.array(table._cols[col_name][:length])
+
+
+def assert_mask_matches(table: CTable, expected_mask: list):
+    actual_mask = get_valid_mask(table)[:len(expected_mask)]
+    expected = np.array(expected_mask, dtype=bool)
+
+    np.testing.assert_array_equal(
+        actual_mask, expected,
+        err_msg=f"Mask mismatch.\nExpected: {expected}\nGot: {actual_mask}"
+    )
+
+
+def assert_data_at_positions(table: CTable, positions: list, expected_ids: list):
+    for pos, expected_id in zip(positions, expected_ids, strict=True):
+        actual_id = int(table._cols["id"][pos])
+        assert actual_id == expected_id, \
+            f"Position {pos}: expected ID {expected_id}, got {actual_id}"
+
+
+def test_insert_after_delete_fills_last_gap():
+    data_c1 = generate_test_data(7, start_id=1)
+    table = CTable(RowModel, new_data=data_c1, expected_size=10)
+
+    table.delete([0, 2, 4, 6])
+
+    expected_mask_after_delete = [False, True, False, True, False, True, False]
+    assert_mask_matches(table, expected_mask_after_delete)
+    assert len(table) == 3
+
+    data_c2 = generate_test_data(3, start_id=8)
+    table.extend(data_c2)
+
+    expected_mask_final = [False, True, False, True, False, True, True, True, True]
+    assert_mask_matches(table, expected_mask_final)
+    assert len(table) == 6
+
+    assert_data_at_positions(table, [6, 7, 8], [8, 9, 10])
+
+
+def test_append_single_row_fills_gap():
+    data = generate_test_data(5, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=10)
+
+    table.delete([1, 3])
+
+    expected_mask = [True, False, True, False, True]
+    assert_mask_matches(table, expected_mask)
+
+    table.append((6, 1j, 50.0, True))
+
+    expected_mask_after = [True, False, True, False, True, True]
+    assert_mask_matches(table, expected_mask_after)
+
+    table.append((7, 2j, 60.0, False))
+
+    expected_mask_final = [True, False, True, False, True, True, True]
+    assert_mask_matches(table, expected_mask_final)
+
+
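+# The two tests above pin down the fill policy: new rows are written starting
+# at the first physical slot after the last valid row, so only a *trailing*
+# gap is reused and interior holes stay untouched.  A sketch of picking that
+# write position (assumed behaviour, not the actual implementation):
+def _next_write_pos_sketch(mask: np.ndarray) -> int:
+    """First physical slot after the last valid row (0 for an empty table)."""
+    live = np.flatnonzero(mask)
+    return 0 if live.size == 0 else int(live[-1]) + 1
+
+# For the mask [F, T, F, T, F, T, F] above, writing starts at slot 6 -- the
+# "last gap" that test_insert_after_delete_fills_last_gap sees filled.
+
+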
+def test_resize_when_capacity_full_with_gaps():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=10, compact=False)
+
+    table.delete(list(range(9)))
+
+    assert len(table) == 1
+
+    initial_capacity = len(table._valid_rows)
+
+    table.append((11, 5j, 75.0, True))
+
+    new_capacity = len(table._valid_rows)
+    assert new_capacity > initial_capacity, \
+        f"Expected resize, but capacity stayed {initial_capacity}"
+
+
+def test_no_resize_with_compact_enabled():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=10, compact=True)
+
+    table.delete(list(range(9)))
+
+    assert len(table) == 1
+
+    initial_capacity = len(table._valid_rows)
+
+    new_data = generate_test_data(3, start_id=11)
+    table.extend(new_data)
+
+    new_capacity = len(table._valid_rows)
+    assert new_capacity <= initial_capacity * 2, \
+        "Unexpected massive resize with compact=True"
+
+
+def test_resize_when_extend_exceeds_capacity():
+    data = generate_test_data(5, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=10, compact=False)
+
+    table.delete([0, 2, 4])
+
+    initial_capacity = len(table._valid_rows)
+
+    large_data = generate_test_data(20, start_id=100)
+    table.extend(large_data)
+
+    new_capacity = len(table._valid_rows)
+    assert new_capacity > initial_capacity
+
+
+def test_extend_fills_from_last_valid_position():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=15)
+
+    table.delete([2, 4, 6])
+
+    new_data = generate_test_data(3, start_id=20)
+    table.extend(new_data)
+
+    assert_data_at_positions(table, [10, 11, 12], [20, 21, 22])
+
+
+def test_multiple_extends_with_gaps():
+    data = generate_test_data(5, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=20)
+
+    table.extend(generate_test_data(3, start_id=10))
+    assert len(table) == 8
+
+    table.delete([2, 4, 6])
+    assert len(table) == 5
+
+    table.extend(generate_test_data(2, start_id=20))
+    assert len(table) == 7
+
+    table.delete([0, 1])
+    assert len(table) == 5
+
+    table.extend(generate_test_data(4, start_id=30))
+    assert len(table) == 9
+
+
+def test_append_and_extend_mixed_with_gaps():
+    table = CTable(RowModel, expected_size=20)
+
+    for i in range(5):
+        table.append((i + 1, complex(i), float(i * 10), True))
+
+    assert len(table) == 5
+
+    table.extend(generate_test_data(5, start_id=10))
+    assert len(table) == 10
+
+    table.delete([1, 3, 5, 7, 9])
+    assert len(table) == 5
+
+    table.append((100, 0j, 50.0, False))
+    assert len(table) == 6
+
+    table.extend(generate_test_data(3, start_id=200))
+    assert len(table) == 9
+
+
+def test_fill_gaps_completely_then_extend():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=15)
+
+    table.delete(list(range(0, 10, 2)))
+    assert len(table) == 5
+
+    table.extend(generate_test_data(5, start_id=20))
+    assert len(table) == 10
+
+
+def test_delete_all_then_extend():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=15)
+
+    table.delete(list(range(10)))
+    assert len(table) == 0
+
+    new_data = generate_test_data(5, start_id=100)
+    table.extend(new_data)
+
+    assert len(table) == 5
+
+
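+# The resize tests earlier in this file separate two exhaustion policies:
+# grow the physical buffers, or (with compact=True) reclaim holes before
+# growing.  A hedged sketch of that decision; the doubling growth factor is
+# an assumption, not the documented behaviour:
+def _ensure_capacity_sketch(capacity: int, last_used: int, n_new: int,
+                            n_holes: int, compact: bool) -> tuple:
+    """Return (new_capacity, compact_first) for a write of n_new rows."""
+    needed = last_used + n_new
+    if needed <= capacity:
+        return capacity, False
+    if compact and n_holes >= needed - capacity:
+        return capacity, True            # compacting frees enough slots
+    while capacity < needed:
+        capacity *= 2                    # assumed growth factor
+    return capacity, False
+
+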
+def test_sparse_table_with_many_gaps():
+    data = generate_test_data(20, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=30)
+
+    to_delete = [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18]
+    table.delete(to_delete)
+
+    assert len(table) == 5
+
+    table.extend(generate_test_data(10, start_id=100))
+
+    assert len(table) == 15
+
+
+def test_alternating_insert_delete_pattern():
+    table = CTable(RowModel, expected_size=50)
+
+    for cycle in range(5):
+        table.extend(generate_test_data(10, start_id=cycle * 100))
+
+        current_len = len(table)
+        if current_len >= 5:
+            to_delete = list(range(0, min(5, current_len)))
+            table.delete(to_delete)
+
+
+def test_manual_compact_before_extend():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=15, compact=False)
+
+    table.delete([1, 3, 5, 7, 9])
+    assert len(table) == 5
+
+    table.compact()
+
+    expected_mask = [True] * 5 + [False] * 10
+    assert_mask_matches(table, expected_mask)
+
+    table.extend(generate_test_data(3, start_id=20))
+    assert len(table) == 8
+
+
+def test_auto_compact_on_extend():
+    data = generate_test_data(10, start_id=1)
+    table = CTable(RowModel, new_data=data, expected_size=15, compact=True)
+
+    table.delete(list(range(0, 8)))
+    assert len(table) == 2
+
+    table.extend(generate_test_data(10, start_id=100))
+
+    assert len(table) == 12
+
+
+def test_data_integrity_after_gap_operations():
+    data1 = [(1, 1j, 10.0, True), (2, 2j, 20.0, False), (3, 3j, 30.0, True)]
+    table = CTable(RowModel, new_data=data1, expected_size=10)
+
+    table.delete(1)
+
+    assert table.row[0].id[0] == 1
+    assert table.row[1].id[0] == 3
+
+    data2 = [(10, 10j, 100.0, True), (11, 11j, 110.0, False)]
+    table.extend(data2)
+
+    assert table.row[0].id[0] == 1
+    assert table.row[1].id[0] == 3
+    assert table.row[2].id[0] == 10
+    assert table.row[3].id[0] == 11
+
+
+def test_complex_scenario_full_workflow():
+    table = CTable(RowModel, expected_size=20, compact=False)
+
+    table.extend(generate_test_data(10, start_id=1))
+    assert len(table) == 10
+
+    table.delete([0, 2, 4, 6, 8])
+    assert len(table) == 5
+
+    table.append((100, 0j, 50.0, True))
+    table.append((101, 1j, 60.0, False))
+    assert len(table) == 7
+
+    table.extend(generate_test_data(5, start_id=200))
+    assert len(table) == 12
+
+    table.delete([3, 7, 10])
+    assert len(table) == 9
+
+    table.extend(generate_test_data(3, start_id=300))
+    assert len(table) == 12
+
+    assert table.nrows == 12
+    assert table.ncols == 4
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/tests/ctable/test_row_logic.py b/tests/ctable/test_row_logic.py
new file mode 100644
index 00000000..98e7d6fa
--- /dev/null
+++ b/tests/ctable/test_row_logic.py
@@ -0,0 +1,387 @@
+from typing import Annotated

+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+from blosc2.ctable import Column
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    # NOTE: no upper bound here -- generate_test_data produces scores
+    # well above 100 (up to i * 10), so le=100 would reject the test data.
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+def generate_test_data(n_rows: int, start_id: int = 0) -> list:
+    return [(start_id + i, float(i * 10), i % 2 == 0) for i in range(n_rows)]
+
+
+def test_row_int_no_holes():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[0]
+
+    assert isinstance(result, CTable)
+    assert len(result) == 1
+    assert result.id[0] == 0
+    assert result.score[0] == 0.0
+    assert result.active[0]
+
+    result = tabla.row[10]
+    assert len(result) == 1
+    assert result.id[0] == 10
+    assert result.score[0] == 100.0
+
+
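+# row[i] indexes *logical* rows: after deletions, the i-th row is the i-th
+# True bit in the validity mask.  A pure-numpy sketch of the lookup the tests
+# below exercise (illustrative only, not the CTable internals):
+def _logical_to_physical_sketch(mask: np.ndarray, i: int) -> int:
+    """Physical slot of the i-th valid row; IndexError when out of range."""
+    live = np.flatnonzero(mask)
+    if i < 0:
+        i += live.size
+    if not 0 <= i < live.size:
+        raise IndexError(f"row {i} out of range for {live.size} valid rows")
+    return int(live[i])
+
+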
+def test_row_int_with_holes():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    result = tabla.row[0]
+    assert len(result) == 1
+    assert result.id[0] == 0
+
+    result = tabla.row[1]
+    assert len(result) == 1
+    assert result.id[0] == 2
+
+    result = tabla.row[5]
+    assert len(result) == 1
+    assert result.id[0] == 10
+
+
+def test_row_int_negative_indices():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[-1]
+    assert len(result) == 1
+    assert result.id[0] == 19
+
+    result = tabla.row[-5]
+    assert len(result) == 1
+    assert result.id[0] == 15
+
+
+def test_row_int_out_of_range():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[10]
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[100]
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[-11]
+
+
+def test_row_slice_no_holes():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[0:5]
+
+    assert isinstance(result, CTable)
+    assert len(result) == 5
+    assert list(result.id) == [0, 1, 2, 3, 4]
+
+    result = tabla.row[10:15]
+    assert len(result) == 5
+    assert list(result.id) == [10, 11, 12, 13, 14]
+
+    result = tabla.row[::2]
+    assert len(result) == 10
+    assert list(result.id) == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
+
+
+def test_row_slice_with_holes():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    result = tabla.row[0:5]
+    assert len(result) == 5
+    assert list(result.id) == [0, 2, 4, 6, 8]
+
+    result = tabla.row[5:10]
+    assert len(result) == 5
+    assert list(result.id) == [10, 11, 12, 13, 14]
+
+
+def test_row_slice_beyond_table_size():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[11:20]
+    assert len(result) == 0
+
+    result = tabla.row[5:100]
+    assert len(result) == 5
+    assert list(result.id) == [5, 6, 7, 8, 9]
+
+    result = tabla.row[100:]
+    assert len(result) == 0
+
+
+def test_row_slice_negative_indices():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[-5:]
+    assert len(result) == 5
+    assert list(result.id) == [15, 16, 17, 18, 19]
+
+    result = tabla.row[-10:-5]
+    assert len(result) == 5
+    assert list(result.id) == [10, 11, 12, 13, 14]
+
+
+def test_row_list_no_holes():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[[0, 5, 10, 15]]
+
+    assert isinstance(result, CTable)
+    assert len(result) == 4
+    assert set(result.id) == {0, 5, 10, 15}
+
+    result = tabla.row[[19, 0, 10]]
+    assert len(result) == 3
+    assert set(result.id) == {0, 10, 19}
+
+
+def test_row_list_with_holes():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete([1, 3, 5, 7, 9])
+
+    result = tabla.row[[0, 2, 4]]
+    assert len(result) == 3
+    assert set(result.id) == {0, 4, 8}
+
+    result = tabla.row[[5, 3, 1]]
+    assert len(result) == 3
+    assert set(result.id) == {2, 6, 10}
+
+
+def test_row_list_out_of_range():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[[0, 5, 100]]
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[[0, 1, -11]]
+
+
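+# Lists and slices go through the same logical->physical mapping.  Per the
+# tests above, slices clamp silently (like Python lists) while out-of-range
+# list entries raise, and duplicate entries collapse.  A sketch under those
+# assumptions (hypothetical helper, not the actual implementation):
+def _select_physical_sketch(mask: np.ndarray, key) -> np.ndarray:
+    live = np.flatnonzero(mask)
+    if isinstance(key, slice):
+        return live[key]                          # numpy slicing clamps
+    idx = np.asarray(key, dtype=np.int64)
+    idx = np.where(idx < 0, idx + live.size, idx)
+    if idx.size and (idx.min() < 0 or idx.max() >= live.size):
+        raise IndexError("row index out of range")
+    sel = np.zeros(live.size, dtype=bool)
+    sel[idx] = True                               # duplicates collapse here
+    return live[sel]
+
+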
+def test_row_returns_view_properties():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[0:10]
+
+    assert result.base is tabla
+    assert result._row_type == tabla._row_type
+    assert result._cols is tabla._cols
+    assert result._col_widths == tabla._col_widths
+    assert result.col_names == tabla.col_names
+
+
+def test_row_chained_views():
+    data = generate_test_data(100)
+    tabla0 = CTable(RowModel, new_data=data)
+
+    tabla1 = tabla0.row[:50]
+    assert tabla1.base is tabla0
+    assert len(tabla1) == 50
+    assert list(tabla1.id)[:5] == [0, 1, 2, 3, 4]
+
+    tabla2 = tabla1.row[:10]
+    assert tabla2.base is tabla1
+    assert len(tabla2) == 10
+    assert list(tabla2.id) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+    tabla3 = tabla2.row[5:]
+    assert tabla3.base is tabla2
+    assert len(tabla3) == 5
+    assert list(tabla3.id) == [5, 6, 7, 8, 9]
+
+
+def test_row_view_on_view_with_holes():
+    data = generate_test_data(50)
+    tabla0 = CTable(RowModel, new_data=data)
+
+    tabla0.delete([5, 10, 15, 20, 25])
+
+    tabla1 = tabla0.row[:30]
+    assert tabla1.base is tabla0
+    assert len(tabla1) == 30
+
+    tabla2 = tabla1.row[10:20]
+    assert tabla2.base is tabla1
+    assert len(tabla2) == 10
+
+
+def test_row_empty_slice():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[5:5]
+    assert len(result) == 0
+
+    result = tabla.row[0:0]
+    assert len(result) == 0
+
+
+def test_row_full_slice():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[:]
+    assert len(result) == 10
+    assert list(result.id) == list(range(10))
+
+
+def test_row_empty_table():
+    tabla = CTable(RowModel)
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[0]
+
+    result = tabla.row[:]
+    assert len(result) == 0
+
+    result = tabla.row[0:10]
+    assert len(result) == 0
+
+
+def test_row_all_deleted():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    tabla.delete(list(range(10)))
+
+    with pytest.raises(IndexError):
+        _ = tabla.row[0]
+
+    result = tabla.row[:]
+    assert len(result) == 0
+
+
+def test_row_view_maintains_mask_reference():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[5:15]
+
+    mask = result._valid_rows[:]
+    true_count = np.count_nonzero(mask)
+    assert true_count == 10
+
+
+def test_row_single_element_list():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[[5]]
+    assert len(result) == 1
+    assert result.id[0] == 5
+
+
+def test_row_duplicate_indices_in_list():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    # Duplicate indices collapse: the selection behaves like a mask
+    result = tabla.row[[5, 5, 5]]
+    assert len(result) == 1
+    assert result.id[0] == 5
+
+
+def test_row_view_base_chain():
+    data = generate_test_data(100)
+    tabla0 = CTable(RowModel, new_data=data)
+
+    assert tabla0.base is None
+
+    tabla1 = tabla0.row[:80]
+    assert tabla1.base is tabla0
+
+    tabla2 = tabla1.row[:60]
+    assert tabla2.base is tabla1
+
+    tabla3 = tabla2.row[:40]
+    assert tabla3.base is tabla2
+
+
+def test_row_view_read_operations():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    view = tabla.row[5:15]
+
+    assert view.id[0] == 5
+    assert view.score[0] == 50.0
+    assert not view.active[0]
+
+    assert list(view.id) == list(range(5, 15))
+
+
+def test_row_list_empty():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[[]]
+    assert len(result) == 0
+
+
+def test_row_slice_with_step():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[0:10:2]
+    assert len(result) == 5
+    assert list(result.id) == [0, 2, 4, 6, 8]
+
+    result = tabla.row[1:10:3]
+    assert len(result) == 3
+    assert list(result.id) == [1, 4, 7]
+
+
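+# Views restrict the validity mask but share column storage with their base,
+# which is why chains like t.row[:50].row[:10] stay cheap.  A toy sketch of
+# how a head-style view mask could be derived (illustrative only):
+def _head_view_sketch(mask: np.ndarray, n: int) -> np.ndarray:
+    """Mask for a view over the first n logical rows; no data is copied."""
+    keep = np.zeros_like(mask)
+    keep[np.flatnonzero(mask)[:n]] = True
+    return keep
+
+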
+def test_row_list_with_negative_indices():
+    data = generate_test_data(10)
+    tabla = CTable(RowModel, new_data=data)
+
+    result = tabla.row[[0, -1, 5]]
+    assert len(result) == 3
+    assert set(result.id) == {0, 5, 9}
+
+
+def test_row_view_columns_are_live():
+    data = generate_test_data(20)
+    tabla = CTable(RowModel, new_data=data)
+
+    view = tabla.row[5:10]
+
+    # Columns obtained from a view are bound to that view, not to the base
+    col = view.id
+    assert isinstance(col, Column)
+    assert col._table is view
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])