Source code for olympus.datasets.tinyimagenet

from collections import OrderedDict
import csv
import functools
import os
import urllib
import zipfile
import shutil
import time

from filelock import FileLock, Timeout

from PIL import Image

import h5py

import numpy

from tqdm import tqdm

import torch
from torchvision import datasets, transforms
import torchvision.transforms.functional as F

from olympus.datasets.dataset import AllDataset
from olympus.datasets.tensorhdf5 import HDF5Dataset
from olympus.transforms import to_pil_image
from olympus.utils import option

# download-url: http://cs231n.stanford.edu/tiny-imagenet-200.zip

# Train: 100000
# Val:    10000
# Test:   10000

# Directory (relative to the data path) that holds the unzipped dataset;
# it is read when building the HDF5 files and removed afterwards.
DIRNAME = 'tiny-imagenet-200'
# File name of the downloaded archive, stored directly under the data path.
ZIP_FILENAME = 'tiny-imagenet-200.zip'
# File names of the HDF5 files generated from the train and val folders.
TRAIN_FILENAME = 'tinyimagenet_train.h5'
VAL_FILENAME = 'tinyimagenet_val.h5'
# No HDF5 file is generated for the test folder.
# TEST_FILENAME = 'tinyimagenet_test.h5'


def get_zipfile_path(data_path):
    """Return the location of the TinyImageNet zip archive under ``data_path``."""
    zip_path = os.path.join(data_path, ZIP_FILENAME)
    return zip_path


def get_dirpath(data_path):
    """Return the location of the unzipped dataset directory under ``data_path``."""
    unzipped_dir = os.path.join(data_path, DIRNAME)
    return unzipped_dir


def all_hdf5_exists(data_path):
    """Tell whether both the train and val HDF5 files exist in ``data_path``."""
    for filename in (TRAIN_FILENAME, VAL_FILENAME):
        hdf5_path = os.path.join(data_path, filename)
        if not os.path.exists(hdf5_path):
            return False
    return True


def build_dataset(data_path, timeout=10 * 60):
    """Download TinyImageNet and convert it to HDF5 files, if not done already.

    The build is guarded by a file lock stored in ``data_path`` so that
    concurrent processes do not download or convert the dataset at the same
    time.

    Parameters
    ----------
    data_path: str
        Directory in which the archive, the lock file and the HDF5 files live.
    timeout: int
        Default number of seconds to wait for the lock. It is passed as the
        default value to ``option('download.lock.timeout', ...)``.

    Notes
    -----
    If the lock cannot be acquired in time, a message is printed and the
    function returns without building anything; loading the dataset is then
    left to the caller.
    """
    if all_hdf5_exists(data_path):
        return

    # The lock file is created inside data_path, so the directory must exist.
    os.makedirs(data_path, exist_ok=True)

    lock_path = os.path.join(data_path, DIRNAME + ".lock")
    lock_timeout = option('download.lock.timeout', timeout, type=int)

    try:
        with FileLock(lock_path, timeout=lock_timeout):
            # Another process may have finished the build while we were
            # waiting for the lock.
            if all_hdf5_exists(data_path):
                return

            try:
                download(data_path)
                unzip(data_path)
                create_hdf5(data_path)
            finally:
                # Only clean up while holding the lock: removing the unzipped
                # files after a lock timeout would delete the data another
                # process is still converting.
                if os.path.isdir(get_dirpath(data_path)):
                    clean(data_path)
    except Timeout:
        # Format the message itself (print() returns None) and report the
        # timeout that was actually used for the lock.
        print("Another process holds the lock since more than {} seconds. "
              "Will try to load the dataset.".format(lock_timeout))


def download(data_path):
    """Download the TinyImageNet zip archive into ``data_path``.

    Nothing is done when the archive is already present. The data is first
    written to a temporary ``.part`` file and renamed once complete, so an
    interrupted download is never mistaken for a finished archive.

    Parameters
    ----------
    data_path: str
        Directory in which the zip archive is stored.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded; import it explicitly.
    import urllib.request

    zip_path = get_zipfile_path(data_path)
    if os.path.exists(zip_path):
        print("Zip file already downloaded")
        return

    # download
    url = 'http://cs231n.stanford.edu/tiny-imagenet-200.zip'
    partial_path = zip_path + '.part'
    block_sz = 8192
    megabyte = 10.0 ** 6

    try:
        with urllib.request.urlopen(url) as response, \
                open(partial_path, 'wb') as f:
            # The header may be absent; tqdm accepts total=None.
            content_length = response.headers.get('Content-Length')
            if content_length is None:
                file_size = None
                print("Downloading: {} (unknown size)".format(zip_path))
            else:
                file_size = int(content_length) / megabyte
                print("Downloading: {} ({}MB)".format(zip_path, file_size))

            with tqdm(total=file_size, desc='TinyImageNet') as pbar:
                while True:
                    buffer = response.read(block_sz)
                    if not buffer:
                        break
                    f.write(buffer)
                    pbar.update(len(buffer) / megabyte)

        # Publish the archive only once it has been fully written.
        os.replace(partial_path, zip_path)
    finally:
        # Left behind only when the download failed or was interrupted.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def unzip(data_path):
    """Extract the downloaded zip archive into ``data_path``."""
    print("Unzipping files...")
    archive_path = get_zipfile_path(data_path)
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=data_path)
    print("Done")


def clean(data_path):
    """Remove the unzipped dataset directory from ``data_path``, if present.

    The function is a no-op when the directory does not exist, so it can be
    called safely from cleanup code even if unzipping never happened.
    """
    dirpath = get_dirpath(data_path)
    if not os.path.isdir(dirpath):
        return

    print("Deleting unzipped files...")
    shutil.rmtree(dirpath)


def create_hdf5(data_path):
    """Build the train and val HDF5 files from the unzipped dataset.

    The output file names come from ``TRAIN_FILENAME`` and ``VAL_FILENAME``,
    the same constants used elsewhere in this module to locate the files.
    """
    dirpath = get_dirpath(data_path)

    create_hdf5_train(dirpath, os.path.join(data_path, TRAIN_FILENAME))
    create_hdf5_val(dirpath, os.path.join(data_path, VAL_FILENAME))


def create_train_loader(dirpath):
    """Yield the samples of the ``train`` folder one at a time.

    The folder is read with ``ImageFolder`` and served through a
    ``DataLoader`` using a batch size of 1 and a single worker.
    """
    train_dir = os.path.join(dirpath, 'train')
    to_tensor = transforms.Compose([transforms.ToTensor()])
    image_folder = datasets.ImageFolder(train_dir, to_tensor)

    loader = torch.utils.data.DataLoader(
        dataset=image_folder, batch_size=1, num_workers=1)

    yield from loader


def create_hdf5_file(dirpath, file_path, n, dataloader):
    """Write the samples of ``dataloader`` into a new HDF5 file.

    The file contains two datasets: ``data`` with shape ``(n, 64, 64, 3)`` and
    ``labels`` with shape ``(n,)``, both stored as ``uint8``.

    Parameters
    ----------
    dirpath: str
        Unused; kept so the signature stays compatible with existing callers.
    file_path: str
        Path of the HDF5 file to create (an existing file is overwritten).
    n: int
        Number of samples the file is sized for.
    dataloader: iterable
        Yields ``(x, y)`` pairs. ``x`` is expected to be a channel-first image
        batch of size 1 with values in [0, 1] (assumption based on the
        scaling by 255 and the axis move below -- confirm against the
        loaders); ``y`` is the class index.
    """
    # The context manager closes the file even if the conversion fails
    # part-way through.
    with h5py.File(file_path, 'w', libver='latest') as f:
        data = f.create_dataset(
            "data", (n, 64, 64, 3),
            chunks=(1, 64, 64, 3),
            dtype=numpy.uint8)
            # compression='lzf')
        labels = f.create_dataset("labels", (n, ), dtype=numpy.uint8)

        f.swmr_mode = True

        for index, (x, y) in enumerate(tqdm(dataloader, total=n, desc='HDF5')):
            # Round before casting: a plain uint8 cast truncates, so any
            # floating point error in the [0, 1] -> [0, 255] scaling could
            # otherwise shave one level off a pixel.
            pixels = numpy.rint(numpy.asarray(x * 255)).astype(numpy.uint8)
            # Move the channel axis last to match the dataset layout.
            data[index] = numpy.moveaxis(pixels, 1, -1)
            labels[index] = y


def create_hdf5_train(dirpath, file_path):
    """Write the 100000 samples of the train loader to ``file_path``."""
    n_samples = 100000
    loader = create_train_loader(dirpath)
    return create_hdf5_file(dirpath, file_path, n_samples, loader)


def create_hdf5_val(dirpath, file_path):
    """Write the 10000 samples of the val loader to ``file_path``."""
    n_samples = 10000
    loader = create_val_loader(dirpath)
    return create_hdf5_file(dirpath, file_path, n_samples, loader)


def create_val_loader(dirpath):
    """Yield ``(image, class_index)`` pairs for the ``val`` folder.

    Images and their class ids are listed in ``val/val_annotations.txt``, a
    tab-separated file whose first column is the image file name and whose
    second column is the class id. Each image is converted to RGB and yielded
    as a tensor with a leading batch dimension of 1.

    Parameters
    ----------
    dirpath: str
        Path of the unzipped dataset directory.
    """
    # The train folder is loaded only for its class-id -> index mapping, so
    # that val labels use the same indices as the train labels.
    train_dataset = datasets.ImageFolder(
        os.path.join(dirpath, 'train'),
        transforms.Compose([transforms.ToTensor()]))
    class_to_idx = train_dataset.class_to_idx

    val_dir = os.path.join(dirpath, 'val')
    annotations_path = os.path.join(val_dir, 'val_annotations.txt')

    # newline='' is what the csv module expects for files it reads.
    with open(annotations_path, 'r', newline='') as annotations:
        for row in csv.reader(annotations, delimiter='\t'):
            if not row:
                # Skip blank lines instead of failing on row[0].
                continue

            filename, class_id = row[0], row[1]

            image_path = os.path.join(val_dir, 'images', filename)
            # Distinct name: do not shadow the annotations file handle.
            with open(image_path, 'rb') as image_file:
                img = Image.open(image_file)
                img = img.convert('RGB')
                x = F.to_tensor(img).unsqueeze(0)

            yield x, class_to_idx[class_id]


class TinyImageNet(AllDataset):
    """Tiny Imagenet has 200 classes. Each class has 500 training images,
    50 validation images, and 50 test images.

    The training and validation sets are released with images and annotations.
    Both class labels and bounding boxes are provided as annotations, but only
    the class label of each image has to be predicted, without localizing the
    objects. The test set is released without labels.
    More at `tiny-imagenet <https://tiny-imagenet.herokuapp.com/>`_.

    Attributes
    ----------
    classes: List[int]
        Return the mapping between samples index and their class

    input_shape: (3, 64, 64)
        Size of a sample stored in this dataset

    target_shape: (200,)
        The dataset is composed of 200 classes

    train_size: 90000
        Size of the train dataset

    valid_size: 10000
        Size of the validation dataset

    test_size: 10000
        Size of the test dataset

    References
    ----------
    .. [1] Jiayu Wu, Qixiang Zhang, Guoxi Xu. "Tiny ImageNet Challenge", 2017

    """
    def __init__(self, data_path):
        # Make sure the HDF5 files exist before opening them.
        build_dataset(data_path)

        # data is stored as uint8
        decode = transforms.Compose([
            to_pil_image,
            transforms.CenterCrop(64),
            transforms.ToTensor()])

        # A single Normalize instance is shared by the three splits.
        normalize = transforms.Normalize(
            mean=[0.4194, 0.3898, 0.3454],
            std=[0.303, 0.291, 0.293])

        # The train split additionally gets random crops and flips.
        augment_and_normalize = transforms.Compose([
            to_pil_image,
            transforms.RandomCrop(64, padding=8),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize])

        split_transforms = {
            'train': augment_and_normalize,
            'valid': transforms.Compose([normalize]),
            'test': transforms.Compose([normalize]),
        }

        def load(filename):
            # Open one HDF5 file; labels are converted to plain ints.
            return HDF5Dataset(
                os.path.join(data_path, filename),
                decode,
                transforms.Lambda(lambda x: int(x)))

        train_dataset = load(TRAIN_FILENAME)
        # The val HDF5 file is used as the test set.
        test_dataset = load(VAL_FILENAME)

        super().__init__(
            torch.utils.data.ConcatDataset([train_dataset, test_dataset]),
            test_size=len(test_dataset),
            transforms=split_transforms,
            target_shape=(200, ),
        )

    @staticmethod
    def categories():
        """Return the set of task categories this dataset belongs to."""
        return {'classification'}


builders = {
    'tinyimagenet': TinyImageNet}