Source code for olympus.datasets.dataset

from collections import defaultdict

from torch.utils.data.dataset import Dataset as TorchDataset
from typing import Callable


class AllDataset(TorchDataset):
    """Olympus data sets are concatenated data sets that include the train, validation and test sets.
    This allows us to change how each set is split and gives us greater power to design performance
    tests.

    Read more on how Olympus uses custom splits to evaluate model performance at :ref XYZ

    Attributes
    ----------
    dataset: TorchDataset
        Underlying dataset (concatenation of original train and test sets)

    transforms: dict
        On an instance, a mapping holding one transformation per split under the keys
        ``train``, ``valid`` and ``test``.

    collate_fn: Optional[Callable] !! static method !!
        merges a list of samples to form a mini-batch of Tensor(s).  Used when using batched loading from a
        map-style dataset.

    """
    # Underlying Pytorch dataset
    dataset: TorchDataset = None

    # Callable object that applies a transformation on each sample of the data set.
    # If you are looking to add a data augmentation step you should be looking at
    # the preprocessor.
    # NOTE: this is only the class-level default; ``__init__`` always replaces it
    # on the instance with a dict keyed by split name.
    transforms: Callable = lambda sample: sample
    collate_fn: Callable = None

    def __init__(self, dataset, data_path=None, input_shape=None, target_shape=None,
                 train_size=None, valid_size=None, test_size=None, transforms=None):
        """Wrap ``dataset`` and record the split sizes, shapes and transformations.

        Parameters
        ----------
        dataset:
            Underlying dataset; it is indexed and iterated as ``(input, target)`` pairs.

        data_path:
            Accepted for interface compatibility; not used by this class.

        input_shape:
            Shape reported by ``input_shape``; inferred from the first sample when ``None``.

        target_shape:
            Shape reported by ``target_shape``; inferred from the first sample when ``None``.

        train_size, valid_size, test_size:
            Number of samples of each split, see the properties of the same name
            for the values used when they are ``None``.

        transforms:
            Either a single callable used for every split, or a dict with the keys
            ``train``, ``test`` and optionally ``valid``.  When ``valid`` is missing
            the ``test`` transformation is used for validation.
            Defaults to the identity.
        """
        self.dataset = dataset
        self._train_size = train_size
        self._valid_size = valid_size
        self._test_size = test_size
        self._input_shape = input_shape
        self._target_shape = target_shape

        if transforms is None:
            transforms = lambda data: data

        if not isinstance(transforms, dict):
            transforms = dict(train=transforms, valid=transforms, test=transforms)
        elif 'valid' not in transforms:
            # Complete a copy of the mapping, the dict given by the caller
            # must not gain a key as a side effect of building the dataset
            transforms = transforms.copy()
            transforms['valid'] = transforms['test']

        self.transforms = transforms

    @property
    def train_size(self):
        """Size of the training set

        When no explicit size was given this is what remains of the dataset once
        the test and validation sets are removed, which requires ``test_size``
        to be set (``None`` cannot be subtracted and raises ``TypeError``).
        """
        if self._train_size is None:
            return len(self) - self.test_size - self.valid_size
        return self._train_size

    @property
    def valid_size(self):
        """Size of the validation set, defaults to the size of the test set"""
        if self._valid_size is None:
            return self.test_size
        return self._valid_size

    @property
    def test_size(self):
        """Size of the test set (``None`` when it was not provided)"""
        return self._test_size

    def __getitem__(self, idx):
        """Return a sample from the entire dataset"""
        return self.dataset[idx]

    def __len__(self):
        """Return the number of samples inside the dataset

        Without an explicit training size this is the length of the underlying
        dataset, otherwise it is the sum of the three split sizes.
        """
        if self._train_size is None:
            return len(self.dataset)
        return self.valid_size + self.train_size + self.test_size

    @property
    def input_shape(self):
        """Return the size of the samples

        When no shape was provided, it is read from the first sample after the
        ``train`` transformation was applied to it.
        """
        if self._input_shape is None:
            return tuple(self.transforms['train'](self.dataset[0][0]).shape)

        return self._input_shape

    @property
    def target_shape(self):
        """Return the size of the target

        When no shape was provided it is inferred from the target of the first
        sample and remembered: an ``int`` target gives ``(number of classes,)``,
        any other target gives its ``shape`` attribute.
        """
        if self._target_shape is None:
            # Fetch the sample a single time, reading a sample can be costly
            target = self.dataset[0][1]

            if isinstance(target, int):
                self._target_shape = (len(self.classes), )
            else:
                self._target_shape = target.shape

        return self._target_shape

    @property
    def classes(self):
        """Return the mapping between samples index and their class

        The result is a list with one entry per class, ordered by class label,
        each entry being the list of the indices of the samples of that class.
        The whole underlying dataset is traversed on every access.
        """
        classes = defaultdict(list)

        for index, (_, y) in enumerate(self.dataset):
            classes[y].append(index)

        return [classes[label] for label in sorted(classes)]

    @staticmethod
    def categories():
        """Dataset tags so we can filter what we want depending on the task"""
        return set()