Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.17.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.12) - 2026-02-23

### Added
- `Dataset.deduplicate()` method to deduplicate images using perceptual hashing. Accepts optional `reference_ids` to deduplicate specific items, or deduplicates the entire dataset when only `threshold` is provided. Required `threshold` parameter (0-64) controls similarity matching (lower = stricter, 0 = exact matches only).
- `Dataset.deduplicate_by_ids()` method for deduplication using internal `dataset_item_ids` directly, avoiding the reference-ID-to-item-ID lookup for improved efficiency.
- `DeduplicationResult` and `DeduplicationStats` dataclasses for structured deduplication results.

Example usage:

```python
dataset = client.get_dataset("ds_...")

# Deduplicate entire dataset
result = dataset.deduplicate(threshold=10)

# Deduplicate specific items by reference IDs
result = dataset.deduplicate(threshold=10, reference_ids=["ref_1", "ref_2", "ref_3"])

# Deduplicate by internal item IDs (more efficient if you have them)
result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["item_1", "item_2"])

# Access results
print(f"Threshold: {result.stats.threshold}")
print(f"Original: {result.stats.original_count}, Unique: {result.stats.deduplicated_count}")
print(result.unique_reference_ids)
```

## [0.17.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.11) - 2025-11-03

### Added
Expand Down
3 changes: 3 additions & 0 deletions nucleus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"AsyncJob",
"EmbeddingsExportJob",
"BoxAnnotation",
"DeduplicationResult",
"DeduplicationStats",
"BoxPrediction",
"CameraParams",
"CategoryAnnotation",
Expand Down Expand Up @@ -128,6 +130,7 @@
from .data_transfer_object.job_status import JobInfoRequestPayload
from .dataset import Dataset
from .dataset_item import DatasetItem
from .deduplication import DeduplicationResult, DeduplicationStats
from .deprecation_warning import deprecated
from .errors import (
DatasetItemRetrievalError,
Expand Down
1 change: 1 addition & 0 deletions nucleus/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@
SLICE_TAGS_KEY = "slice_tags"
TAXONOMY_NAME_KEY = "taxonomy_name"
TASK_ID_KEY = "task_id"
THRESHOLD_KEY = "threshold"
TRACK_REFERENCE_ID_KEY = "track_reference_id"
TRACK_REFERENCE_IDS_KEY = "track_reference_ids"
TRACKS_KEY = "tracks"
Expand Down
102 changes: 102 additions & 0 deletions nucleus/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
REQUEST_ID_KEY,
SCENE_IDS_KEY,
SLICE_ID_KEY,
THRESHOLD_KEY,
TRACK_REFERENCE_IDS_KEY,
TRACKS_KEY,
TRAINED_SLICE_ID_KEY,
Expand All @@ -83,6 +84,7 @@
check_items_have_dimensions,
)
from .dataset_item_uploader import DatasetItemUploader
from .deduplication import DeduplicationResult, DeduplicationStats
from .deprecation_warning import deprecated
from .errors import NotFoundError, NucleusAPIError
from .job import CustomerJobTypes, jobs_status_overview
Expand Down Expand Up @@ -1006,6 +1008,106 @@ def create_slice_by_ids(
)
return Slice(response[SLICE_ID_KEY], self._client)

def deduplicate(
    self,
    threshold: int,
    reference_ids: Optional[List[str]] = None,
) -> DeduplicationResult:
    """Deduplicate images or frames in this dataset via perceptual hashing.

    Parameters:
        threshold: Hamming distance threshold (0-64). Lower values are
            stricter; 0 matches exact duplicates only.
        reference_ids: Optional list of reference IDs to restrict the run
            to. When omitted (or None), the entire dataset is deduplicated.
            An explicit empty list is rejected — pass None for the whole
            dataset.

    Returns:
        DeduplicationResult carrying ``unique_reference_ids``,
        ``unique_item_ids``, and a ``DeduplicationStats`` summary.

    Raises:
        ValueError: If reference_ids is an empty list (use None for the
            entire dataset).
        NucleusAPIError: If threshold is not an integer in [0, 64], if any
            reference_id is unknown to the dataset, or if any item lacks a
            perceptual hash (pHash) — contact Scale support in that case.

    Note:
        - For scene datasets this operates on the underlying scene frames,
          not the scenes themselves; supply frame reference IDs.
        - Very large datasets may take significant time to process.
    """
    # An explicit empty list is almost certainly a caller bug, so reject it
    # up front; None (the default) means "whole dataset".
    if reference_ids is not None and not reference_ids:
        raise ValueError(
            "reference_ids cannot be empty. Omit reference_ids parameter to deduplicate entire dataset."
        )

    request_body: Dict[str, Any] = {THRESHOLD_KEY: threshold}
    if reference_ids is not None:
        request_body[REFERENCE_IDS_KEY] = reference_ids

    raw = self._client.make_request(
        request_body, f"dataset/{self.id}/deduplicate"
    )

    # Fold the server's nested "stats" payload into the typed result.
    raw_stats = raw["stats"]
    stats = DeduplicationStats(
        threshold=threshold,
        original_count=raw_stats["original_count"],
        deduplicated_count=raw_stats["deduplicated_count"],
    )
    return DeduplicationResult(
        unique_item_ids=raw["unique_item_ids"],
        unique_reference_ids=raw["unique_reference_ids"],
        stats=stats,
    )

def deduplicate_by_ids(
    self,
    threshold: int,
    dataset_item_ids: List[str],
) -> DeduplicationResult:
    """Deduplicate images or frames addressed by internal dataset item IDs.

    Unlike ``deduplicate()``, which accepts user-defined reference IDs,
    this variant takes Nucleus-internal item IDs directly.

    Parameters:
        threshold: Hamming distance threshold (0-64). Lower values are
            stricter; 0 matches exact duplicates only.
        dataset_item_ids: Non-empty list of internal dataset item IDs to
            deduplicate. To process the entire dataset, see the
            documentation for ``deduplicate()`` instead.

    Returns:
        DeduplicationResult carrying ``unique_item_ids``,
        ``unique_reference_ids``, and a ``DeduplicationStats`` summary.

    Raises:
        ValueError: If dataset_item_ids is empty.
        NucleusAPIError: If threshold is not an integer in [0, 64], if any
            dataset_item_id is unknown to the dataset, or if any item lacks
            a perceptual hash (pHash) — contact Scale support in that case.
    """
    # Guard against an empty selection before hitting the API.
    if not dataset_item_ids:
        raise ValueError(
            "dataset_item_ids must be non-empty. Use deduplicate() for entire dataset."
        )

    request_body = {
        DATASET_ITEM_IDS_KEY: dataset_item_ids,
        THRESHOLD_KEY: threshold,
    }
    raw = self._client.make_request(
        request_body, f"dataset/{self.id}/deduplicate"
    )

    # Fold the server's nested "stats" payload into the typed result.
    raw_stats = raw["stats"]
    return DeduplicationResult(
        unique_item_ids=raw["unique_item_ids"],
        unique_reference_ids=raw["unique_reference_ids"],
        stats=DeduplicationStats(
            threshold=threshold,
            original_count=raw_stats["original_count"],
            deduplicated_count=raw_stats["deduplicated_count"],
        ),
    )

def build_slice(
self,
name: str,
Expand Down
16 changes: 16 additions & 0 deletions nucleus/deduplication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from dataclasses import dataclass
from typing import List


@dataclass
class DeduplicationStats:
    """Summary statistics for one deduplication run.

    Returned inside a ``DeduplicationResult``; ``threshold`` echoes the
    Hamming-distance threshold (0-64) the caller supplied, and the counts
    come from the server's response.
    """

    # Hamming distance threshold used for matching (0-64; lower = stricter).
    threshold: int
    # Number of items considered before deduplication.
    original_count: int
    # Number of unique items remaining after deduplication.
    deduplicated_count: int


@dataclass
class DeduplicationResult:
    """Structured result of a dataset deduplication run.

    Produced by ``Dataset.deduplicate()`` and ``Dataset.deduplicate_by_ids()``.
    """

    unique_item_ids: List[str]  # Internal dataset item IDs
    unique_reference_ids: List[str]  # User-defined reference IDs
    stats: DeduplicationStats  # Threshold and before/after counts for the run
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running

[tool.poetry]
name = "scale-nucleus"
version = "0.17.11"
version = "0.17.12"
description = "The official Python client library for Nucleus, the Data Platform for AI"
license = "MIT"
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
Expand Down