# Source code for olympus.datasets.emnist

from filelock import FileLock
import torch
from torchvision import datasets, transforms

from olympus.datasets.dataset import AllDataset
from olympus.utils import option


class BalancedEMNIST(AllDataset):
    """Balanced split of the Extended MNIST (EMNIST) dataset.

    The MNIST database was derived from a larger dataset known as the NIST Special Database 19,
    which contains digits as well as uppercase and lowercase handwritten letters.
    EMNIST is a variant of the full NIST dataset that follows the same conversion paradigm
    used to create the MNIST dataset. The result is a set of datasets that constitute
    more challenging classification tasks involving letters and digits.
    More on `arxiv <https://arxiv.org/abs/1702.05373>`_.

    The train and test parts are both loaded through :class:`torchvision.datasets.EMNIST`
    with ``split='balanced'`` and concatenated, train samples first and test samples last.

    See also :class:`.MNIST` and :class:`.FashionMNIST`

    Parameters
    ----------
    data_path: str
        Root directory handed to :class:`torchvision.datasets.EMNIST`;
        the data is downloaded there when it is not already present.

    Attributes
    ----------
    classes: List[int]
        Return the mapping between samples index and their class

    input_shape: (28, 28)
        Size of a sample stored in this dataset

    target_shape: (47,)
        The dataset is composed of 47 classes, 10 digits, 37 letters

    train_size: 94000
        Size of the train dataset

    valid_size: 18800
        Size of the validation dataset

    test_size: 18800
        Size of the test dataset

    References
    ----------
    .. [1] Gregory Cohen, Saeed Afshar, Jonathan Tapson, André van Schaik.
        "EMNIST: an extension of MNIST to handwritten letters", Mar 2017

    """
    def __init__(self, data_path):
        # Read the configurable lock timeout once (defaults to 4 * 60).
        lock_timeout = option('download.lock.timeout', 4 * 60, type=int)

        # Hold the file lock while torchvision downloads/loads both parts so
        # that concurrent processes using the same lock file do not fetch the
        # archive at the same time.
        with FileLock('EMNIST.lock', timeout=lock_timeout):
            train_dataset = datasets.EMNIST(
                data_path, train=True, download=True, split='balanced',
                transform=transforms.ToTensor())

            test_dataset = datasets.EMNIST(
                data_path, train=False, download=True, split='balanced',
                transform=transforms.ToTensor())

        # Train samples come first, test samples last; the size of the test
        # part is forwarded to the base class.
        # NOTE(review): presumably AllDataset uses ``test_size`` to carve the
        # trailing samples out as the test split -- confirm in AllDataset.
        super(BalancedEMNIST, self).__init__(
            torch.utils.data.ConcatDataset([train_dataset, test_dataset]),
            test_size=len(test_dataset))

    @staticmethod
    def categories():
        """Return the set of task categories this dataset belongs to."""
        return {'classification'}


# Dataset constructors exposed by this module, keyed by dataset name.
builders = dict(balanced_emnist=BalancedEMNIST)