Source code for nutsml.batcher

"""
.. module:: batcher
   :synopsis: Collecting samples in mini-batches for GPU-based training.
"""
import warnings
import numpy as np
import nutsml.imageutil as ni

from nutsflow import nut_function
from nutsflow.base import Nut
from nutsflow.iterfunction import take, PrefetchIterator


def build_number_batch(numbers, dtype):
    """
    Convert a collection of numbers into a numpy array of the given dtype.

    >>> numbers = (1, 2, 3, 1)
    >>> build_number_batch(numbers, 'uint8')
    array([1, 2, 3, 1], dtype=uint8)

    :param iterable number numbers: Numbers to create batch from
    :param numpy data type dtype: Data type of batch, e.g. 'uint8'
    :return: Numpy array holding the numbers
    :rtype: numpy.array
    """
    batch = np.array(numbers, dtype=dtype)
    return batch
def build_one_hot_batch(class_ids, dtype, num_classes):
    """
    Return one hot vectors for class ids.

    >>> class_ids = [0, 1, 2, 1]
    >>> build_one_hot_batch(class_ids, 'uint8', 3)
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]], dtype=uint8)

    :param iterable class_ids: Class indices in {0, ..., num_classes-1}
    :param numpy data type dtype: Data type of batch, e.g. 'uint8'
    :param num_classes: Number of classes
    :return: One hot vectors for class ids. The result has the shape of
             class_ids with an additional last axis of size num_classes.
    :rtype: numpy.array
    :raise IndexError: If a class id is negative or >= num_classes.
    """
    # Use the platform index type instead of uint16 so that class ids
    # >= 65536 are not wrapped around (or rejected) by the conversion.
    ids = np.array(class_ids, dtype=np.intp)

    # Explicit range check: with a signed index type a negative id would
    # otherwise silently address a class counted from the end.
    if ids.size and (ids.min() < 0 or ids.max() >= num_classes):
        raise IndexError('Class id out of range [0, ' + str(num_classes) +
                         '): min=' + str(ids.min()) +
                         ', max=' + str(ids.max()))

    # Fill only the rows that are needed rather than materializing a
    # num_classes x num_classes identity matrix and selecting from it.
    batch = np.zeros(ids.shape + (num_classes,), dtype=dtype)
    rows = batch.reshape(ids.size, num_classes)  # view on batch
    rows[np.arange(ids.size), ids.ravel()] = 1
    return batch
def build_vector_batch(vectors, dtype):
    """
    Stack row vectors into a two-dimensional batch.

    >>> from nutsflow.common import shapestr
    >>> vectors = [np.array([1,2,3]), np.array([2, 3, 4])]
    >>> batch = build_vector_batch(vectors, 'uint8')
    >>> shapestr(batch)
    '2x3'
    >>> batch
    array([[1, 2, 3],
           [2, 3, 4]], dtype=uint8)

    :param iterable vectors: Numpy row vectors
    :param numpy data type dtype: Data type of batch, e.g. 'uint8'
    :return: vstack of vectors
    :rtype: numpy.array
    :raise ValueError: If no vectors are given.
    """
    if len(vectors) == 0:
        raise ValueError('No vectors to build batch!')
    stacked = np.vstack(vectors)
    return stacked.astype(dtype)
def build_tensor_batch(tensors, dtype, axes=None, expand=None):
    """
    Stack tensors into a batch, optionally reordering or adding axes.

    >>> from nutsflow.common import shapestr
    >>> tensors = [np.zeros((2, 3)), np.ones((2, 3))]
    >>> batch = build_tensor_batch(tensors, 'uint8')
    >>> shapestr(batch)
    '2x2x3'
    >>> print(batch)
    [[[0 0 0]
      [0 0 0]]
    <BLANKLINE>
     [[1 1 1]
      [1 1 1]]]

    >>> batch = build_tensor_batch(tensors, 'uint8', expand=0)
    >>> shapestr(batch)
    '2x1x2x3'
    >>> print(batch)
    [[[[0 0 0]
       [0 0 0]]]
    <BLANKLINE>
     [[[1 1 1]
       [1 1 1]]]]

    >>> batch = build_tensor_batch(tensors, 'uint8', axes=(1, 0))
    >>> shapestr(batch)
    '2x3x2'
    >>> print(batch)
    [[[0 0]
      [0 0]
      [0 0]]
    <BLANKLINE>
     [[1 1]
      [1 1]
      [1 1]]]

    :param iterable tensors: Numpy tensors
    :param numpy data type dtype: Data type of batch, e.g. 'uint8'
    :param tuple|None axes: axes order, e.g. to move a channel axis to the
                       last position. (see numpy transpose for details)
    :param int|None expand: Add empty dimension at expand dimension.
                       (see numpy expand_dims for details).
    :return: stack of tensors, with batch axis first.
    :rtype: numpy.array
    :raise ValueError: If no tensors are given.
    """
    if len(tensors) == 0:
        raise ValueError('No tensors to build batch!')

    def _prepare(tensor):
        # Transposition is applied before the extra axis is inserted.
        if axes is not None:
            tensor = np.transpose(tensor, axes)
        if expand is not None:
            tensor = np.expand_dims(tensor, expand)
        return tensor

    prepared = [_prepare(tensor) for tensor in tensors]
    return np.stack(prepared).astype(dtype)
def build_image_batch(images, dtype, channelfirst=False):
    """
    Return batch of images.

    If images have no channel a channel axis is added. For channelfirst=True
    it will be added/moved to front otherwise the channel comes last.
    All images in batch will have a channel axis. Batch is of shape
    (n, c, h, w) or (n, h, w, c) depending on channelfirst, where n is
    the number of images in the batch.

    >>> from nutsflow.common import shapestr
    >>> images = [np.zeros((2, 3)), np.ones((2, 3))]
    >>> batch = build_image_batch(images, 'uint8', True)
    >>> shapestr(batch)
    '2x1x2x3'
    >>> batch
    array([[[[0, 0, 0],
             [0, 0, 0]]],
    <BLANKLINE>
    <BLANKLINE>
           [[[1, 1, 1],
             [1, 1, 1]]]], dtype=uint8)

    :param numpy array images: Images to batch. Must be of shape (h,w,c)
           or (h,w). Gray-scale with channel is fine (h,w,1) and also
           alpha channel is fine (h,w,4).
    :param numpy data type dtype: Data type of batch, e.g. 'uint8'
    :param bool channelfirst: If True, channel is added/moved to front.
    :return: Image batch with shape (n, c, h, w) or (n, h, w, c).
    :rtype: np.array
    :raise ValueError: If there are no images, the first image does not
           have 2 or 3 axes, the channel axis does not appear to be last,
           or the images differ in shape.
    """
    n = len(images)
    if not n:
        raise ValueError('No images to build batch!')

    # The first image defines the shape all images in the batch must have.
    first_shape = images[0].shape
    if len(first_shape) == 2:
        first_shape = (first_shape[0], first_shape[1], 1)
    if len(first_shape) != 3:
        raise ValueError('Image must have 2 or 3 axes: ' + str(first_shape))
    h, w, c = first_shape

    # Heuristic: more channels than rows or columns indicates that the
    # channel axis is not the last one.
    if c > w or c > h:
        raise ValueError('Channel not at last axis: ' + str((h, w, c)))

    # Allocate directly in the target dtype. This avoids a float64 staging
    # buffer plus a second full copy, and keeps integer data exact.
    batch_shape = (n, c, h, w) if channelfirst else (n, h, w, c)
    batch = np.empty(batch_shape, dtype=dtype)
    for i, image in enumerate(images):
        image = ni.add_channel(image, channelfirst)
        if image.shape != batch.shape[1:]:
            raise ValueError('Images vary in shape: ' + str(image.shape))
        batch[i] = image
    return batch
class BuildBatch(Nut):
    """
    Build batches for GPU-based neural network training.
    """

    def __init__(self, batchsize, prefetch=1):
        """
        iterable >> BuildBatch(batchsize, prefetch=1)

        Take samples in iterable, extract specified columns, convert
        column data to numpy arrays of various types, aggregate converted
        samples into a batch.

        The format of a batch is a list of lists: [[inputs], [outputs]]
        where inputs and outputs are Numpy arrays.

        The following example uses PrintType() to print the shape of
        the batches constructed. This is useful for development and
        debugging but should be removed in production.

        >>> from nutsflow import Collect, PrintType

        >>> numbers = [4.1, 3.2, 1.1]
        >>> images = [np.zeros((5, 3)), np.ones((5, 3)) , np.ones((5, 3))]
        >>> class_ids = [1, 2, 1]
        >>> samples = list(zip(numbers, images, class_ids))

        >>> build_batch = (BuildBatch(batchsize=2)
        ...                .input(0, 'number', 'float32')
        ...                .input(1, 'image', np.uint8, True)
        ...                .output(2, 'one_hot', np.uint8, 3))
        >>> batches = samples >> build_batch >> PrintType() >> Collect()
        [[<ndarray> 2:float32, <ndarray> 2x1x5x3:uint8], [<ndarray> 2x3:uint8]]
        [[<ndarray> 1:float32, <ndarray> 1x1x5x3:uint8], [<ndarray> 1x3:uint8]]

        In the example above, we have multiple inputs and a single output,
        and the batch is of format [[number, image], [one_hot]], where each
        data element is a Numpy array with the shown shape and dtype.

        Sample columns can be ignored or reused. Assuming an autoencoder, one
        might wish to reuse the sample image as input and output:

        >>> build_batch = (BuildBatch(2)
        ...                .input(1, 'image', np.uint8, True)
        ...                .output(1, 'image', np.uint8, True))
        >>> batches = samples >> build_batch >> PrintType() >> Collect()
        [[<ndarray> 2x1x5x3:uint8], [<ndarray> 2x1x5x3:uint8]]
        [[<ndarray> 1x1x5x3:uint8], [<ndarray> 1x1x5x3:uint8]]

        In the prediction phase no target outputs are needed. If the batch
        contains only inputs, the batch format is just [inputs].

        >>> build_pred_batch = (BuildBatch(2)
        ...                     .input(1, 'image', 'uint8', True))
        >>> batches = samples >> build_pred_batch >> PrintType() >> Collect()
        [<ndarray> 2x1x5x3:uint8]
        [<ndarray> 1x1x5x3:uint8]

        :param int batchsize: Size of batch = number of rows in batch.
        :param int prefetch: Number of batches to prefetch. This speeds up
           GPU based training, since one batch is built on CPU while
           another is processed on the GPU. Use prefetch=0 to disable
           prefetching, e.g. to simplify debugging.
        """
        self.batchsize = batchsize
        self.prefetch = prefetch
        # Column specifications in the order they were added; each entry is
        # a tuple (col, name, isinput, args, kwargs).
        self.colspecs = []
        # Maps the builder name used in input()/output() to its function.
        self.builder = {
            'image': build_image_batch,
            'number': build_number_batch,
            'vector': build_vector_batch,
            'tensor': build_tensor_batch,
            'one_hot': build_one_hot_batch,
        }

    def input(self, col, name, *args, **kwargs):
        """
        Specify and add input columns for batch to create

        :param int col: column of the sample to extract and to create a
          batch input column from.
        :param string name: Name of the column function to apply to create
            a batch column, e.g. 'image'
            See the following functions for more details:
            'image': nutsml.batcher.build_image_batch
            'number': nutsml.batcher.build_number_batch
            'vector': nutsml.batcher.build_vector_batch
            'tensor': nutsml.batcher.build_tensor_batch
            'one_hot': nutsml.batcher.build_one_hot_batch
            The name is only validated when batches are built.
        :param args args: Arguments for column function, e.g. dtype
        :param kwargs kwargs: Keyword arguments for column function
        :return: instance of BuildBatch
        :rtype: BuildBatch
        """
        self.colspecs.append((col, name, True, args, kwargs))
        return self

    def output(self, col, name, *args, **kwargs):
        """
        Specify and add output columns for batch to create

        :param int col: column of the sample to extract and to create a
          batch output column from.
        :param string name: Name of the column function to apply to create
            a batch column, e.g. 'image'
            See the following functions for more details:
            'image': nutsml.batcher.build_image_batch
            'number': nutsml.batcher.build_number_batch
            'vector': nutsml.batcher.build_vector_batch
            'tensor': nutsml.batcher.build_tensor_batch
            'one_hot': nutsml.batcher.build_one_hot_batch
            The name is only validated when batches are built.
        :param args args: Arguments for column function, e.g. dtype
        :param kwargs kwargs: Keyword arguments for column function
        :return: instance of BuildBatch
        :rtype: BuildBatch
        """
        self.colspecs.append((col, name, False, args, kwargs))
        return self

    def _batch_generator(self, iterable):
        """
        Return generator over batches for given iterable of samples.

        :param iterator iterable: Iterator over samples; it is consumed
            batchsize samples at a time.
        :return: Generator over batches of format [[inputs], [outputs]],
            or just [inputs] if no output columns were specified.
        :raise ValueError: If a column specification names an unknown
            builder.
        """
        while True:
            batchsamples = list(take(iterable, self.batchsize))
            if not batchsamples:
                break
            cols = list(zip(*batchsamples))  # flip rows to cols
            batch = [[], []]  # in, out columns of batch
            for col, func, isinput, args, kwargs in self.colspecs:
                if func not in self.builder:
                    raise ValueError('Invalid builder: ' + func)
                coldata = self.builder[func](cols[col], *args, **kwargs)
                batch[0 if isinput else 1].append(coldata)
            if not batch[1]:  # no output (prediction phase)
                batch = batch[0]  # flatten and take only inputs
            yield batch

    def __rrshift__(self, iterable):
        """
        Convert samples in iterable into mini-batches.

        Each batch has the format [[inputs], [outputs]], or just [inputs]
        if no output columns were specified.

        :param iterable iterable: Iterable over samples.
        :return: Iterator over mini-batches; wrapped in a PrefetchIterator
            if prefetch is non-zero.
        :rtype: iterator over lists of np.array
        """
        batch_gen = self._batch_generator(iter(iterable))
        if self.prefetch:
            batch_gen = PrefetchIterator(batch_gen, self.prefetch)
        return batch_gen
@nut_function
def Mixup(batch, alpha):
    """
    Mixup produces random interpolations between data and labels.

    Usage:
    ... >> BuildBatch() >> Mixup(0.1) >> network.train() >> ...

    Implementation based on the following paper:
    mixup: Beyond Empirical Risk Minimization
    https://arxiv.org/abs/1710.09412

    :param list batch: Batch consisting of list of input data and list of
             output data, where data must be numeric, e.g. images and
             one-hot-encoded class labels that can be interpolated between.
    :param float alpha: Control parameter for beta distribution the
             interpolation factors are sampled from. Range: [0,...,1]
             For alpha <= 0 no mixup is performed.
    :return: The unchanged batch if alpha <= 0, otherwise a new batch
             [inputs, outputs] in which every array is blended with a
             shuffled copy of itself.
    :rtype: list
    """
    if alpha <= 0:
        return batch

    # One shuffled index order and one mixing factor are drawn per batch
    # and shared by all input and output arrays, so that data and labels
    # are interpolated consistently.
    n_samples = len(batch[0][0])
    shuffled = np.arange(n_samples)
    np.random.shuffle(shuffled)
    lam = np.random.beta(alpha, alpha)

    def _interpolate(data):
        return lam * data + (1 - lam) * data[shuffled]

    mixed_inputs = [_interpolate(data) for data in batch[0]]
    mixed_outputs = [_interpolate(data) for data in batch[1]]
    return [mixed_inputs, mixed_outputs]