Source code for olympus.datasets.tinyimagenet

from collections import OrderedDict
import csv
import functools
import os
import urllib
import zipfile
import shutil
import time

from filelock import FileLock, Timeout

from PIL import Image

import h5py

import numpy

from tqdm import tqdm

import torch
from torchvision import datasets, transforms
import torchvision.transforms.functional as F

from olympus.datasets.dataset import AllDataset
from olympus.datasets.tensorhdf5 import HDF5Dataset
from olympus.transforms import to_pil_image
from olympus.utils import option

# download-url: http://cs231n.stanford.edu/tiny-imagenet-200.zip

# Train: 100000
# Val:    10000
# Test:   10000

# Directory, inside the data path, that holds the unzipped files
# (this is the directory removed by ``clean``)
DIRNAME = 'tiny-imagenet-200'
# File name under which the downloaded archive is stored inside the data path
ZIP_FILENAME = 'tiny-imagenet-200.zip'
# HDF5 files that must both exist for ``all_hdf5_exists`` to report True
TRAIN_FILENAME = 'tinyimagenet_train.h5'
VAL_FILENAME = 'tinyimagenet_val.h5'
# TEST_FILENAME = 'tinyimagenet_test.h5'


def get_zipfile_path(data_path):
    """Return the path of the dataset archive located inside ``data_path``."""
    zip_path = os.path.join(data_path, ZIP_FILENAME)
    return zip_path
def get_dirpath(data_path):
    """Return the path of the unzipped dataset directory inside ``data_path``."""
    unzipped_dir = os.path.join(data_path, DIRNAME)
    return unzipped_dir
def all_hdf5_exists(data_path):
    """Tell whether both the train and validation HDF5 files exist in ``data_path``."""
    for hdf5_name in (TRAIN_FILENAME, VAL_FILENAME):
        if not os.path.exists(os.path.join(data_path, hdf5_name)):
            return False
    return True
def build_dataset(data_path, timeout=10 * 60):
    """Download, unzip and convert the dataset to HDF5 unless it is already built.

    The work is serialized across processes with a file lock stored in
    ``data_path``. If the lock cannot be acquired in time, a message is printed
    and the function returns without building anything.

    Parameters
    ----------
    data_path: str
        Directory in which the archive, the lock file and the HDF5 files live.
    timeout: int
        Fallback number of seconds to wait for the lock; it is passed as the
        default value to ``option('download.lock.timeout', ...)``.
    """
    if all_hdf5_exists(data_path):
        return

    lock_timeout = option('download.lock.timeout', timeout, type=int)
    lock_path = os.path.join(data_path, DIRNAME + ".lock")

    try:
        with FileLock(lock_path, timeout=lock_timeout):
            # Another process may have finished the build while we were waiting
            # for the lock; do not redo (and overwrite) its work.
            if all_hdf5_exists(data_path):
                return

            try:
                download(data_path)
                unzip(data_path)
                create_hdf5(data_path)
            finally:
                # Clean up while still holding the lock, and only if there is
                # something to delete (a failed download never unzips anything).
                if os.path.isdir(get_dirpath(data_path)):
                    clean(data_path)
    except Timeout:
        # The unzipped files belong to the process holding the lock, so they
        # must not be deleted here. Report the timeout that was actually used.
        print("Another process holds the lock since more than {} seconds. "
              "Will try to load the dataset.".format(lock_timeout))
def download(data_path):
    """Download the Tiny ImageNet archive into ``data_path`` if it is not there yet.

    The archive is first written to a temporary ``.part`` file and only renamed
    to its final name once the transfer completed, so an interrupted download
    is never mistaken for a complete archive on the next call.

    Parameters
    ----------
    data_path: str
        Directory in which the archive is stored.
    """
    # ``import urllib`` alone does not guarantee that the ``request`` submodule
    # is loaded, so import it explicitly.
    import urllib.request

    zip_path = get_zipfile_path(data_path)
    if os.path.exists(zip_path):
        print("Zip file already downloaded")
        return

    url = 'http://cs231n.stanford.edu/tiny-imagenet-200.zip'
    partial_path = zip_path + '.part'
    block_sz = 8192

    try:
        with urllib.request.urlopen(url) as response, \
                open(partial_path, 'wb') as f:
            # Sizes are reported in MB; the header may be absent, in which case
            # the progress bar simply has no total.
            content_length = response.headers.get('Content-Length')
            if content_length is None:
                file_size = None
                size_label = "unknown size"
            else:
                file_size = int(content_length) / (10.0 ** 6)
                size_label = "{}MB".format(file_size)

            print("Downloading: {} ({})".format(zip_path, size_label))

            with tqdm(total=file_size, desc='TinyImageNet') as pbar:
                while True:
                    buffer = response.read(block_sz)
                    if not buffer:
                        break

                    f.write(buffer)
                    pbar.update(len(buffer) / (10.0 ** 6))
    except BaseException:
        # Do not leave a truncated file behind, then let the error propagate.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    os.replace(partial_path, zip_path)
def unzip(data_path):
    """Extract the whole dataset archive into ``data_path``."""
    print("Unzipping files...")

    archive_path = get_zipfile_path(data_path)
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=data_path)

    print("Done")
def clean(data_path):
    """Delete the unzipped dataset directory located inside ``data_path``.

    Calling it when the directory does not exist is a no-op, which makes it
    safe to use from cleanup paths that run before anything was unzipped.
    """
    print("Deleting unzipped files...")
    try:
        shutil.rmtree(get_dirpath(data_path))
    except FileNotFoundError:
        # Nothing was unzipped (or it is already gone): nothing to clean.
        pass
def create_hdf5(data_path):
    """Convert the unzipped train and validation images into HDF5 files.

    The files are written in ``data_path`` under the module-level
    ``TRAIN_FILENAME`` and ``VAL_FILENAME`` names, i.e. the same names that
    ``all_hdf5_exists`` looks for.
    """
    dirpath = get_dirpath(data_path)

    create_hdf5_train(dirpath, os.path.join(data_path, TRAIN_FILENAME))
    create_hdf5_val(dirpath, os.path.join(data_path, VAL_FILENAME))
def create_train_loader(dirpath):
    """Yield the batches of a one-sample-per-batch loader over ``dirpath``/train.

    Images are read with ``ImageFolder`` and converted to tensors.
    """
    as_tensor = transforms.Compose([transforms.ToTensor()])
    train_folder = datasets.ImageFolder(os.path.join(dirpath, 'train'), as_tensor)

    loader = torch.utils.data.DataLoader(
        dataset=train_folder,
        batch_size=1,
        num_workers=1)

    yield from loader
def create_hdf5_file(dirpath, file_path, n, dataloader):
    """Write the samples produced by ``dataloader`` into a new HDF5 file.

    The file contains a ``data`` dataset of shape ``(n, 64, 64, 3)`` and a
    ``labels`` dataset of shape ``(n,)``, both stored as uint8.

    Parameters
    ----------
    dirpath: str
        Not used by this function; kept for interface compatibility.
    file_path: str
        Path of the HDF5 file to create (an existing file is overwritten).
    n: int
        Number of samples the datasets are sized for.
    dataloader: iterable
        Yields ``(x, y)`` pairs; ``x`` is scaled by 255 and cast to uint8, and
        its axis 1 is moved last before being stored (presumably a
        ``(1, 3, 64, 64)`` image with values in [0, 1] -- confirm with callers).
    """
    # The context manager guarantees the file is closed even if a sample fails
    # to load or convert half-way through.
    with h5py.File(file_path, 'w', libver='latest') as f:
        data = f.create_dataset(
            "data", (n, 64, 64, 3), chunks=(1, 64, 64, 3),
            dtype=numpy.uint8)  # compression='lzf')
        labels = f.create_dataset("labels", (n, ), dtype=numpy.uint8)

        # Enabled only after the datasets are created, before any sample is written
        f.swmr_mode = True

        for index, (x, y) in enumerate(tqdm(dataloader, total=n, desc='HDF5')):
            x = numpy.array(x * 255, dtype=numpy.uint8)
            data[index] = numpy.moveaxis(x, 1, -1)
            labels[index] = y
def create_hdf5_train(dirpath, file_path):
    """Store the 100000 training samples found under ``dirpath`` in ``file_path``."""
    n_samples = 100000
    train_batches = create_train_loader(dirpath)
    return create_hdf5_file(dirpath, file_path, n_samples, train_batches)
def create_hdf5_val(dirpath, file_path):
    """Store the 10000 validation samples found under ``dirpath`` in ``file_path``."""
    n_samples = 10000
    val_batches = create_val_loader(dirpath)
    return create_hdf5_file(dirpath, file_path, n_samples, val_batches)
def create_val_loader(dirpath):
    """Yield ``(image, class_index)`` pairs for the validation images.

    The image file name and class id are the first two tab-separated columns
    of ``val/val_annotations.txt``. Class ids are mapped to indices with the
    ``class_to_idx`` table of an ``ImageFolder`` built on the train directory,
    so that train and validation labels agree.

    Each image is converted to RGB and yielded as a tensor with a leading
    dimension of size one.
    """
    train_dataset = datasets.ImageFolder(
        os.path.join(dirpath, 'train'),
        transforms.Compose([transforms.ToTensor()]))
    class_to_idx = train_dataset.class_to_idx

    val_dir = os.path.join(dirpath, 'val')

    with open(os.path.join(val_dir, 'val_annotations.txt'), 'r') as annotations:
        for row in csv.reader(annotations, delimiter='\t'):
            filename = row[0]
            class_id = row[1]

            image_path = os.path.join(val_dir, 'images', filename)
            # A distinct name keeps the annotation file handle from being shadowed
            with open(image_path, 'rb') as image_file:
                img = Image.open(image_file)
                # convert() loads the pixel data, so the file can be closed afterwards
                img = img.convert('RGB')

            x = F.to_tensor(img).unsqueeze(0)
            yield x, class_to_idx[class_id]
class TinyImageNet(AllDataset):
    """Tiny Imagenet has 200 classes.

    Each class has 500 training images, 50 validation images, and 50 test
    images. The training and validation sets are released with images and
    annotations (class labels and bounding boxes); only the class label of
    each image has to be predicted, without localizing the objects.
    The test set is released without labels.

    More at `tiny-imagenet <https://tiny-imagenet.herokuapp.com/>`_.

    Attributes
    ----------
    classes: List[int]
        Return the mapping between samples index and their class

    input_shape: (3, 64, 64)
        Size of a sample stored in this dataset

    target_shape: (200,)
        The dataset is composed of 200 classes

    train_size: 90000
        Size of the train dataset

    valid_size: 10000
        Size of the validation dataset

    test_size: 10000
        Size of the test dataset

    References
    ----------
    .. [1] Jiayu Wu, Qixiang Zhang, Guoxi Xu. "Tiny ImageNet Challenge", 2017

    """

    def __init__(self, data_path):
        # Make sure the HDF5 files are available before opening them
        build_dataset(data_path)

        # Transform given to the HDF5 datasets themselves:
        # data is stored as uint8
        storage_transform = transforms.Compose([
            to_pil_image,
            transforms.CenterCrop(64),
            transforms.ToTensor()])

        normalize = transforms.Normalize(
            mean=[0.4194, 0.3898, 0.3454],
            std=[0.303, 0.291, 0.293])

        # Only the train split is augmented; valid and test are just normalized
        split_transforms = {
            'train': transforms.Compose([
                to_pil_image,
                transforms.RandomCrop(64, padding=8),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize]),
            'valid': transforms.Compose([normalize]),
            'test': transforms.Compose([normalize]),
        }

        train_dataset = HDF5Dataset(
            os.path.join(data_path, TRAIN_FILENAME),
            storage_transform,
            transforms.Lambda(lambda x: int(x)))

        # The official validation images are the ones used as the test split
        test_dataset = HDF5Dataset(
            os.path.join(data_path, VAL_FILENAME),
            storage_transform,
            transforms.Lambda(lambda x: int(x)))

        all_samples = torch.utils.data.ConcatDataset([train_dataset, test_dataset])

        super().__init__(
            all_samples,
            test_size=len(test_dataset),
            transforms=split_transforms,
            target_shape=(200, ),
        )

    @staticmethod
    def categories():
        """Return the set of task categories this dataset belongs to."""
        return {'classification'}
# Dataset classes defined in this module, keyed by dataset name
builders = dict(tinyimagenet=TinyImageNet)