Source code for zea.data.convert.cetus

"""Functionality to convert the CETUS dataset to the zea format.

.. note::
    Requires SimpleITK to be installed: ``pip install SimpleITK``.

The CETUS (Challenge on Endocardial Three-dimensional Ultrasound Segmentation)
dataset contains 3D echocardiographic volumes from 45 patients. Each patient has
end-diastolic (ED) and end-systolic (ES) B-mode volumes with corresponding
ground truth left ventricle segmentation masks. The volumes are stored in NIfTI
(.nii.gz) format with isotropic voxel spacing.

**License**: `CC BY-NC-SA 4.0 <https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode>`_

The CETUS dataset is available free of charge strictly for non-commercial
scientific research purposes only.

**Citation** (required for any use of the CETUS database):

    O. Bernard, et al.
    "Standardized Evaluation System for Left Ventricular Segmentation Algorithms
    in 3D Echocardiography"
    IEEE Transactions on Medical Imaging, vol. 35, no. 4, pp. 967-977, April 2016.
    `DOI: 10.1109/tmi.2015.2503890 <https://doi.org/10.1109/tmi.2015.2503890>`_

**Links**:

- `MICCAI 2014 CETUS Challenge <https://www.creatis.insa-lyon.fr/Challenge/CETUS/>`_
- `Original dataset <https://humanheart-project.creatis.insa-lyon.fr/database/#collection/62eb991b73e9f0048c3a6c45>`_

"""

from __future__ import annotations

import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import numpy as np
from tqdm import tqdm

from zea import log
from zea.data.convert.utils import download_from_girder, sitk_load
from zea.data.file import File

# Reference for the CETUS paper; process_cetus embeds it in every converted file.
CETUS_CITATION = " ".join(
    [
        "O. Bernard, et al.",
        '"Standardized Evaluation System for Left Ventricular Segmentation Algorithms '
        'in 3D Echocardiography"',
        "in IEEE Transactions on Medical Imaging, vol. 35, no. 4, pp. 967-977, April 2016.",
        "https://doi.org/10.1109/tmi.2015.2503890",
    ]
)

# License name followed by the URL of its legal text.
CETUS_LICENSE = "CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode)"

# Generic dataset-level description combining the license and the citation.
CETUS_DESCRIPTION = (
    "CETUS (Challenge on Endocardial Three-dimensional Ultrasound Segmentation) "
    "3D echocardiographic dataset converted to zea format. "
    f"License: {CETUS_LICENSE}. "
    f"Citation: {CETUS_CITATION}"
)

# Girder collection ID for the CETUS dataset
_CETUS_COLLECTION_ID = "62eb991b73e9f0048c3a6c45"

# Dataset splits as half-open ``[start, end)`` patient-ID ranges:
# IDs 1-30 -> train, 31-38 -> val, 39-45 -> test.
splits = {
    "train": [1, 31],
    "val": [31, 39],
    "test": [39, 46],
}


def get_split(patient_id: int) -> str:
    """Return the name of the dataset split that contains a patient.

    Args:
        patient_id: Integer ID of the patient (1-45).

    Returns:
        The split name: ``"train"``, ``"val"``, or ``"test"``.

    Raises:
        ValueError: If ``patient_id`` lies outside every range in ``splits``.
    """
    # Every entry of ``splits`` is a half-open ``[first, stop)`` ID interval;
    # the first interval (in dict order) that contains the ID wins.
    candidates = (
        name for name, (first, stop) in splits.items() if first <= patient_id < stop
    )
    match = next(candidates, None)
    if match is None:
        raise ValueError(f"Did not find split for patient: {patient_id}")
    return match
def _detect_background_level(volume: np.ndarray) -> float: """Detect the background padding value of a CETUS volume. The CETUS volumes are zero-padded outside the scanning cone, but the padding value is not exactly zero — it varies per file (e.g. 8 or 13 on a [0, 255] scale). This function finds the mode of the integer-binned histogram which corresponds to the dominant background intensity. Args: volume: 3-D numpy array with values in [0, 255]. Returns: The detected background intensity level. """ # Use integer bins (0..255) — the padding value is always a single integer counts, bin_edges = np.histogram(volume.ravel(), bins=256, range=(0, 256)) bg_level = float(bin_edges[np.argmax(counts)]) return bg_level
def process_cetus(source_path, output_path, overwrite=False):
    """Convert one CETUS B-mode volume (a single patient time point) to a zea HDF5 file.

    The volume is rescaled from [0, 255] intensities to [-60, 0] dB and passed to
    ``File.create`` as ``image_sc`` with a leading frame axis of length one. When a
    sibling ``*_gt.nii.gz`` file exists, its non-zero voxels are added as a
    ``segmentation`` entry labelled ``"endocardium"``. The patient name, the
    time-point label and the citation go into the metadata; license, citation and
    voxel spacing are written into the file description.

    Args:
        source_path (str or Path): Path to the source ``.nii.gz`` B-mode file,
            named like ``patient01_ED.nii.gz``.
        output_path (str or Path): Path to the output ``.hdf5`` file.
        overwrite (bool, optional): Whether to replace an existing output file.
            When False and the output exists, the call logs a message and returns
            without doing anything. Defaults to False.
    """
    source_path, output_path = Path(source_path), Path(output_path)

    # An existing output is either left untouched or removed up front.
    if output_path.exists():
        if not overwrite:
            log.info(f"Output file {output_path} already exists. Skipping.")
            return
        output_path.unlink()

    # Load the B-mode volume; it must be 3-D (unpacked as D, H, W below).
    volume, metadata = sitk_load(source_path)

    # Spacing is treated as metres (CETUS headers hold values around 0.0005763).
    voxel_spacing = np.array(metadata["spacing"], dtype=np.float64)

    # The padding value is non-zero and differs per file, so detect it and pin
    # those voxels to exactly -60 dB (pure black) after the conversion.
    background = int(_detect_background_level(volume))
    is_background = volume.astype(int) == background

    # Linear map of [0, 255] intensities onto [-60, 0] dB.
    volume_db = (volume / 255.0) * 60.0 - 60.0
    volume_db[is_background] = -60.0

    # Prepend the frame axis: (1, D, H, W). Each file holds one time point.
    image_sc = volume_db[np.newaxis, ...]

    # "patient01_ED.nii.gz" -> stem "patient01_ED.nii" -> "patient01_ED"
    name_parts = source_path.stem.removesuffix(".nii").split("_")
    patient_name = name_parts[0]  # e.g. "patient01"
    time_point = name_parts[-1]  # "ED" or "ES"

    # Physical extent as [0, D*s0, 0, W*s2, 0, H*s1].
    # NOTE(review): which physical axis each pair denotes depends on the order
    # sitk_load uses for "spacing" -- confirm against the zea extent convention.
    D, H, W = volume.shape
    extent_values = [
        0,
        D * voxel_spacing[0],
        0,
        W * voxel_spacing[2],
        0,
        H * voxel_spacing[1],
    ]

    data = {
        "image_sc": {
            "values": image_sc.astype(np.float32),
            "extent": np.array(extent_values, dtype=np.float32),
        }
    }

    # The ground truth, if any, sits next to the B-mode file with a "_gt" suffix.
    gt_path = source_path.with_name(source_path.name.replace(".nii.gz", "_gt.nii.gz"))
    if gt_path.exists():
        gt_volume, _ = sitk_load(gt_path)
        data["segmentation"] = {
            # Any non-zero voxel is foreground; add frame and label axes -> (1, D, H, W, 1).
            "values": (gt_volume > 0)[np.newaxis, ..., np.newaxis],
            # Same extent as the image, built as a separate array.
            "extent": np.array(extent_values, dtype=np.float32),
            "labels": np.array(["endocardium"]),
        }

    # Per-file description carrying spacing, license and citation.
    file_description = (
        f"CETUS dataset - {patient_name} {time_point} - "
        f"3D echocardiographic volume converted to zea format. "
        f"Voxel spacing: {voxel_spacing.tolist()} m. "
        f"License: {CETUS_LICENSE}. "
        f"Citation: {CETUS_CITATION}"
    )

    file_metadata = {
        "subject": {"id": patient_name},
        "credit": CETUS_CITATION,
        "annotations": {"label": np.array([time_point])},
    }

    File.create(
        path=output_path,
        data=data,
        metadata=file_metadata,
        probe_name="generic",
        description=file_description,
        overwrite=overwrite,
    )
def _process_task(task):
    """Run ``process_cetus`` for one ``(source, output)`` pair.

    Tasks are passed as plain strings, so this is the entry point handed to the
    process pool. The output's parent directory is created if needed and existing
    outputs are never overwritten.

    Args:
        task (tuple): ``(source_file_str, output_file_str)``

    Raises:
        Exception: Whatever ``process_cetus`` raises, re-raised after the failing
            source path has been logged.
    """
    src, dst = (Path(entry) for entry in task)
    dst.parent.mkdir(parents=True, exist_ok=True)

    try:
        process_cetus(src, dst, overwrite=False)
    except Exception:
        # Record which file failed, then let the caller decide what to do.
        log.error("Error processing %s", src)
        raise
def download_cetus(  # pragma: no cover
    destination: str | Path, patients: list[int] | None = None
) -> Path:
    """Fetch the CETUS dataset from the Girder server.

    Thin wrapper that calls ``download_from_girder`` with the CETUS collection ID
    and the dataset name ``"CETUS"``.

    Args:
        destination: Directory where the dataset will be downloaded.
        patients: Patient IDs to download (1-45), forwarded unchanged. ``None``
            is documented as meaning all 45 patients.

    Returns:
        The value returned by ``download_from_girder`` (the dataset directory).
    """
    request = {
        "collection_id": _CETUS_COLLECTION_ID,
        "destination": destination,
        "dataset_name": "CETUS",
        "patients": patients,
    }
    return download_from_girder(**request)
def convert_cetus(args):
    """Convert the CETUS dataset into zea HDF5 files across dataset splits.

    Collects every ``*_ED.nii.gz`` and ``*_ES.nii.gz`` B-mode volume below the
    source folder, assigns each patient to a train/val/test split, and runs the
    per-file conversion either serially or in a process pool. A failing file is
    logged by name and does not abort the run; the number of failures is reported
    once all files have been attempted.

    Usage::

        python -m zea.data.convert cetus <source_folder> <destination_folder> --download

    Args:
        args (argparse.Namespace): An object with attributes:

            - src (str | Path): Path to the folder containing CETUS patient
              subfolders, or a directory to download into when ``--download`` is set.
            - dst (str | Path): Root destination folder for zea HDF5 outputs; split
              subfolders (train/val/test) will be created.
            - download (bool, optional): If True, download the dataset first from
              the Girder server.
            - no_hyperthreading (bool, optional): If True, run tasks serially
              instead of using a process pool.
            - upload (bool, optional): If True, upload the converted dataset to
              HuggingFace Hub after conversion. Only for zea maintainers with push
              access to the repository.

    Raises:
        FileNotFoundError: If the source folder does not exist.
        ValueError: If a matching file name does not start with ``patient<number>``
            or the patient number belongs to no split.
    """
    cetus_source_folder = Path(args.src)
    cetus_output_folder = Path(args.dst)

    # Optionally download the dataset
    if getattr(args, "download", False):
        cetus_source_folder = download_cetus(cetus_source_folder)

    if not cetus_source_folder.exists():
        raise FileNotFoundError(
            f"Source folder does not exist: {cetus_source_folder}. "
            "Use --download to download the CETUS dataset automatically."
        )

    # Existing outputs are never overwritten, so warn about reused split folders.
    for split in splits:
        split_dir = cetus_output_folder / split
        if split_dir.exists():
            log.warning(
                f"Output folder {split_dir} already exists. Existing files will be skipped."
            )

    # B-mode volumes only: ground truth files end in "_gt.nii.gz" and do not match.
    files = sorted(cetus_source_folder.glob("**/*_ED.nii.gz")) + sorted(
        cetus_source_folder.glob("**/*_ES.nii.gz")
    )

    tasks = []
    for source_file in files:
        # "patient01_ED.nii.gz" -> "patient01". Splitting the full name avoids
        # special-casing the double ".nii.gz" suffix.
        patient_name = source_file.name.split("_")[0]
        patient_id = int(patient_name.removeprefix("patient"))
        split = get_split(patient_id)

        # <dst>/<split>/<patient>/<patient>_<ED|ES>.hdf5
        output_name = source_file.name.replace(".nii.gz", ".hdf5")
        output_file = cetus_output_folder / split / patient_name / output_name
        output_file.parent.mkdir(parents=True, exist_ok=True)
        # Plain strings keep the task tuples cheap to send to worker processes.
        tasks.append((str(source_file), str(output_file)))

    if not tasks:
        log.info("No CETUS files found to process.")
        return

    log.info(f"Found {len(tasks)} files to convert.")

    serial = bool(getattr(args, "no_hyperthreading", False))
    failed = []  # source paths whose conversion raised

    if serial:
        log.info("Running tasks serially (no ProcessPoolExecutor)")
        for task in tqdm(tasks, desc="Processing files (serial)"):
            try:
                _process_task(task)
            except Exception as exc:
                log.error(f"Failed to process {task[0]}: {exc}")
                failed.append(task[0])
    else:
        with ProcessPoolExecutor() as exe:
            futures = [exe.submit(_process_task, task) for task in tasks]
            # Pair each future with its task so a failure can name the file.
            paired = zip(tasks, futures)
            for task, future in tqdm(paired, total=len(tasks), desc="Processing files"):
                try:
                    future.result()
                except Exception as exc:
                    log.error(f"Failed to process {task[0]}: {exc}")
                    failed.append(task[0])

    suffix = " (serial)" if serial else ""
    log.info(f"Processing finished for {len(tasks)} files{suffix}")
    if failed:
        log.warning(f"{len(failed)} of {len(tasks)} files failed to convert.")

    if getattr(args, "upload", False):
        upload_cetus(cetus_output_folder)
# --------------------------------------------------------------------------- # HuggingFace Hub upload # --------------------------------------------------------------------------- _HF_REPO_ID = "zeahub/cetus-miccai-2014" _DATASET_CARD = """\ --- license: cc-by-nc-sa-4.0 task_categories: - image-segmentation tags: - ultrasound - echocardiography - 3d - cardiac - medical pretty_name: "CETUS: Challenge on Endocardial Three-dimensional Ultrasound Segmentation" size_categories: - n<1K --- # CETUS - 3-D Echocardiographic Ultrasound Dataset This dataset is a **zea-format** (HDF5) conversion of the [CETUS (MICCAI 2014)](https://www.creatis.insa-lyon.fr/Challenge/CETUS/) challenge data for endocardial segmentation in 3-D echocardiography. | Property | Value | |---|---| | **Modality** | 3-D transthoracic echocardiography | | **Patients** | 45 | | **Time points** | End-diastole (ED) and end-systole (ES) per patient | | **Files** | 90 HDF5 volumes (45 patients x 2 time points) | | **Voxel spacing** | Isotropic, ~0.576 mm (varies per patient) | | **Segmentation** | Left-ventricle endocardial surface (binary) | | **Splits** | train (1-30), val (31-38), test (39-45) | ## Conversion This dataset was downloaded, converted to zea format, and uploaded using the [zea](https://github.com/tue-bmd/zea) data converter: ```bash python -m zea.data.convert cetus <src> <dst> --download ``` ## Dataset structure ``` train/ patient01/ patient01_ED.hdf5 patient01_ES.hdf5 ... val/ patient31/ ... test/ patient39/ ... 
``` Each HDF5 file follows the [zea data format](https://github.com/tue-bmd/zea) and contains: - `data/image_sc` - B-mode volume in dB, shape `(1, depth, height, width)` - `non_standard_elements/segmentation` - binary LV mask, same shape - `non_standard_elements/voxel_spacing` - `(x, y, z)` in metres - `non_standard_elements/patient_id`, `time_point`, `citation`, `license` ## License **CC BY-NC-SA 4.0** - <https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode> The CETUS dataset is available free of charge strictly for **non-commercial scientific research purposes only**. ## Citation If you use this dataset, please cite the original CETUS paper: ```bibtex @article{{bernard2016standardized, title = {{Standardized Evaluation System for Left Ventricular Segmentation Algorithms in 3D Echocardiography}}, author = {{Bernard, Olivier and Bosch, Johan G. and Heyde, Brecht and Alessandrini, Martino and Barbosa, Daniel and Camarasu-Pop, Sorina and Cervenansky, Fr{{\\'e}}d{{\\'e}}ric and Valette, S{{\\'e}}bastien and Mirea, Oana and Berber, Merih and others}}, journal = {{IEEE Transactions on Medical Imaging}}, volume = {{35}}, number = {{4}}, pages = {{967--977}}, year = {{2016}}, doi = {{10.1109/tmi.2015.2503890}} }} ``` ## Links - **Original challenge**: <https://www.creatis.insa-lyon.fr/Challenge/CETUS/> - **Original dataset**: <https://humanheart-project.creatis.insa-lyon.fr/database/#collection/62eb991b73e9f0048c3a6c45> - **zea toolkit**: <https://github.com/tue-bmd/zea> """ def _write_dataset_card(folder: Path) -> Path: # pragma: no cover """Write the HuggingFace dataset card (README.md) into *folder*.""" card_path = folder / "README.md" card_path.write_text(_DATASET_CARD) return card_path
def upload_cetus(output_folder: str | Path) -> None:  # pragma: no cover
    """Push the converted CETUS dataset to the HuggingFace Hub.

    Only for zea maintainers with push access to the repository. The dataset card
    is written into ``output_folder`` first, then a summary is logged and the
    user is asked to confirm on stdin. Anything other than ``y`` cancels the
    upload; the card stays on disk either way.

    Args:
        output_folder: Root folder containing the train/val/test splits.

    Raises:
        FileNotFoundError: If no ``*.hdf5`` file exists below ``output_folder``.
    """
    from huggingface_hub import HfApi, login

    output_folder = Path(output_folder)

    # Collect files to upload
    hdf5_files = sorted(output_folder.rglob("*.hdf5"))
    if not hdf5_files:
        raise FileNotFoundError(f"No HDF5 files found in {output_folder}")

    # One pass: accumulate the byte total and count files per top-level folder.
    total_bytes = 0
    split_counts = {}
    for hdf5_file in hdf5_files:
        total_bytes += hdf5_file.stat().st_size
        top_level = hdf5_file.relative_to(output_folder).parts[0]
        split_counts[top_level] = split_counts.get(top_level, 0) + 1
    total_size_mb = total_bytes / 1e6

    # Write dataset card
    _write_dataset_card(output_folder)

    # Assemble the summary, then emit it line by line.
    rule = "=" * 60
    summary = [
        "",
        rule,
        " CETUS upload summary",
        rule,
        f" Repository : {_HF_REPO_ID}",
        f" Source : {output_folder}",
        f" Files : {len(hdf5_files)} HDF5 + README.md",
    ]
    for split, count in sorted(split_counts.items()):
        summary.append(f" {split:>5s}: {count} files")
    summary += [
        f" Total size : {total_size_mb:.1f} MB",
        f" License : {CETUS_LICENSE}",
        rule,
        "",
    ]
    for line in summary:
        log.info(line)

    # Explicit opt-in: only a (case-insensitive) "y" proceeds.
    confirmed = input("Proceed with upload? [y/N] ").strip().lower() == "y"
    if not confirmed:
        log.info("Upload cancelled.")
        return

    login(new_session=False)
    HfApi().upload_folder(
        folder_path=str(output_folder),
        repo_id=_HF_REPO_ID,
        repo_type="dataset",
        commit_message="Upload CETUS dataset (zea format)",
    )
    log.info(f"Dataset uploaded to https://huggingface.co/datasets/{_HF_REPO_ID}")