from __future__ import annotations
"""Datasets resource for the CorePlexML SDK."""
from coreplexml._http import HTTPClient
class DatasetsResource:
    """Manage datasets and dataset versions.

    Datasets are the foundation for training experiments. Upload CSV files
    and CorePlexML will version, profile, and analyze them automatically.
    """

    def __init__(self, http: HTTPClient):
        # Shared HTTP transport injected by the client; all methods delegate to it.
        self._http = http

    @staticmethod
    def _normalize_dataset_upload(payload: dict) -> dict:
        """Normalize upload response keys for docs/examples compatibility.

        The upload endpoint may identify the created dataset as
        ``dataset_id``/``dataset_version_id``; mirror those values under
        ``id``/``version_id`` (without overwriting existing keys) so examples
        using either naming keep working. Non-dict payloads normalize to ``{}``.
        """
        if not isinstance(payload, dict):
            return {}
        out = dict(payload)  # copy: never mutate the caller's payload
        if "id" not in out and out.get("dataset_id"):
            out["id"] = out["dataset_id"]
        if "version_id" not in out and out.get("dataset_version_id"):
            out["version_id"] = out["dataset_version_id"]
        return out

    def list(self, project_id: str | None = None, limit: int = 50, offset: int = 0) -> dict:
        """List datasets, optionally filtered by project.

        Args:
            project_id: Filter by project UUID (optional).
            limit: Maximum results (default 50).
            offset: Pagination offset.

        Returns:
            Dictionary with ``items`` list and ``total`` count.
        """
        params: dict = {"limit": limit, "offset": offset}
        # Only send the filter when provided so the server returns all datasets otherwise.
        if project_id:
            params["project_id"] = project_id
        return self._http.get("/api/datasets", params=params)

    def upload(self, project_id: str, file_path: str, name: str, description: str = "") -> dict:
        """Upload a CSV file as a new dataset.

        Args:
            project_id: UUID of the owning project.
            file_path: Local path to the CSV file.
            name: Display name for the dataset.
            description: Optional description.

        Returns:
            Created dataset dictionary with ``id``, ``name``, etc.
        """
        data = self._http.upload(
            "/api/datasets/upload",
            file_path,
            fields={"project_id": project_id, "name": name, "description": description},
        )
        # Server responses may use dataset_id/dataset_version_id; normalize to id/version_id.
        return self._normalize_dataset_upload(data)

    def get(self, dataset_id: str) -> dict:
        """Get dataset details by ID.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Dataset dictionary.
        """
        return self._http.get(f"/api/datasets/{dataset_id}")

    def versions(self, dataset_id: str) -> dict:
        """List all versions of a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Dictionary with paginated ``items`` list plus ``total``, ``limit``, and ``offset``.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/versions")

    def quality(self, dataset_id: str) -> dict:
        """Get data quality report for a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Quality metrics dictionary.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/quality")

    def columns(self, dataset_id: str) -> dict:
        """Get column metadata for a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Dictionary with ``columns`` list.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/columns")

    def analyze(self, dataset_id: str) -> dict:
        """Run statistical analysis on a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Analysis results dictionary.
        """
        return self._http.get(f"/api/datasets/{dataset_id}/analyze")

    def delete(self, dataset_id: str) -> dict:
        """Delete a dataset.

        Args:
            dataset_id: UUID of the dataset.

        Returns:
            Empty dictionary on success.
        """
        return self._http.delete(f"/api/datasets/{dataset_id}")

    def download(self, dataset_id: str, output_path: str, format: str = "csv") -> str:
        """Download dataset to a local file.

        Args:
            dataset_id: UUID of the dataset.
            output_path: Local path to save the file.
            format: Output format -- ``csv`` or ``parquet`` (default ``csv``).
                (Name kept for interface compatibility despite shadowing the builtin.)

        Returns:
            The output_path on success.
        """
        return self._http.download(
            f"/api/datasets/{dataset_id}/download",
            output_path,
            params={"format": format},
        )