from __future__ import annotations
"""Datasets resource for the CorePlexML SDK."""
from coreplexml._http import HTTPClient
class DatasetsResource:
    """Manage datasets and dataset versions.

    Datasets are the foundation for training experiments. Upload CSV files
    and CorePlexML will version, profile, and analyze them automatically.
    """

    def __init__(self, http: HTTPClient):
        # Shared HTTP transport injected by the client; all methods delegate to it.
        self._http = http

    @staticmethod
    def _normalize_dataset_upload(payload: dict) -> dict:
        """Normalize upload response keys for docs/examples compatibility.

        The upload endpoint may identify the created dataset as
        ``dataset_id``/``dataset_version_id``; mirror those values under
        ``id``/``version_id`` (without overwriting existing keys) so examples
        using either naming keep working. Non-dict payloads normalize to ``{}``.
        """
        if not isinstance(payload, dict):
            return {}
        out = dict(payload)  # copy: never mutate the caller's payload
        if "id" not in out and out.get("dataset_id"):
            out["id"] = out["dataset_id"]
        if "version_id" not in out and out.get("dataset_version_id"):
            out["version_id"] = out["dataset_version_id"]
        return out

    def list(self, project_id: str | None = None, limit: int = 50, offset: int = 0) -> dict:
        """List datasets, optionally filtered by project.

        Args:
            project_id: Filter by project UUID (optional).
            limit: Maximum results (default 50).
            offset: Pagination offset.

        Returns:
            Dictionary with ``items`` list and ``total`` count.
        """
        params: dict = {"limit": limit, "offset": offset}
        # Only send the filter when provided so the server returns all datasets otherwise.
        if project_id:
            params["project_id"] = project_id
        return self._http.get("/api/datasets", params=params)

    def upload(self, project_id: str, file_path: str, name: str, description: str = "") -> dict:
        """Upload a CSV file as a new dataset.

        Args:
            project_id: UUID of the owning project.
            file_path: Local path to the CSV file.
            name: Display name for the dataset.
            description: Optional description.

        Returns:
            Created dataset dictionary with ``id``, ``name``, etc.
        """
        data = self._http.upload(
            "/api/datasets/upload",
            file_path,
            fields={"project_id": project_id, "name": name, "description": description},
        )
        # Server responses may use dataset_id/dataset_version_id; normalize to id/version_id.
        return self._normalize_dataset_upload(data)

    def get(self, dataset_id: str) -> dict:
        """Get dataset details by ID.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Dataset dictionary.
        """
        return self._http.get(f"/api/datasets/{dataset_id}")

    def versions(self, dataset_id: str) -> dict:
        """List all versions of a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Dictionary with paginated ``items`` list plus ``total``, ``limit``, and ``offset``.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/versions")

    def quality(self, dataset_id: str) -> dict:
        """Get data quality report for a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Quality metrics dictionary.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/quality")

    def columns(self, dataset_id: str) -> dict:
        """Get column metadata for a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Dictionary with ``columns`` list.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/columns")

    def analyze(self, dataset_id: str) -> dict:
        """Run statistical analysis on a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Analysis results dictionary.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/analyze")

    def delete(self, dataset_id: str) -> dict:
        """Delete a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Empty dictionary on success.
        """
        return self._http.delete(f"/api/datasets/{dataset_id}")

    def download(self, dataset_id: str, output_path: str, format: str = "csv") -> str:
        """Download dataset to a local file.

        Args:
            dataset_id: UUID of the dataset.
            output_path: Local path to save the file.
            format: Output format -- ``csv`` or ``parquet`` (default ``csv``).
                (Name kept for interface compatibility despite shadowing the builtin.)

        Returns:
            The output_path on success.
        """
        return self._http.download(
            f"/api/datasets/{dataset_id}/download",
            output_path,
            params={"format": format},
        )