# Source code for nutsml.reader

"""
.. module:: reader
   :synopsis: Reading of sample data and images
"""
from __future__ import absolute_import

import os

import pandas as pd
import numpy as np

from glob import glob
from collections import namedtuple
from fnmatch import fnmatch
from nutsml.imageutil import load_image
from nutsml.fileutil import reader_filepath
from nutsflow import NutSource, nut_function, nut_source
from nutsflow.common import as_set


@nut_source
def ReadLabelDirs(basedir, filepattern='*', exclude='_*'):
    """
    Read file paths from label directories.

    Typically used when classification data is organized in folders,
    where the folder name represents the class label and the files in
    the folder the data samples (images, documents, ...) for that class.

    Only entries of basedir that are directories are treated as labels.
    Label directories are visited in the order returned by os.listdir,
    and backslashes in the returned file paths are replaced by forward
    slashes.

    >>> from __future__ import print_function
    >>> from nutsflow import Sort

    >>> read = ReadLabelDirs('tests/data/labeldirs', '*.txt')
    >>> samples = read >> Sort()
    >>> for sample in samples:
    ...     print(sample)
    ...
    ('tests/data/labeldirs/0/test0.txt', '0')
    ('tests/data/labeldirs/1/test1.txt', '1')
    ('tests/data/labeldirs/1/test11.txt', '1')

    :param string basedir: Path to folder that contains label directories.
    :param string filepattern: Pattern for filepaths to read from
           label directories, e.g. '*.jpg', '*.txt'
    :param string exclude: Pattern for label directories to exclude.
           Default is '_*' which excludes all label folders prefixed with '_'.
    :return: iterator over (filepath, label) tuples
    :rtype: iterator
    """
    for label in os.listdir(basedir):
        labeldir = os.path.join(basedir, label)
        # Plain files in basedir and excluded folders are not labels.
        if not os.path.isdir(labeldir) or fnmatch(label, exclude):
            continue
        for filepath in glob(os.path.join(labeldir, filepattern)):
            # Use forward slashes regardless of the platform's separator.
            yield filepath.replace('\\', '/'), label


@nut_function
def ReadNumpy(sample, columns, pathfunc=None, allow_pickle=False):
    """
    Load numpy arrays from filesystem.

    Note that the loaded numpy arrays replace the file name|path in the
    sample.

    >>> from nutsflow import Consume, Collect, PrintType

    >>> samples = ['tests/data/img_arrays/nut_color.jpg.npy']
    >>> samples >> ReadNumpy(None) >> PrintType() >> Consume()
    (<ndarray> 213x320x3:uint8)

    >>> samples = [('tests/data/img_arrays/nut_color.jpg.npy', 'class0')]
    >>> samples >> ReadNumpy(0) >> PrintType() >> Consume()
    (<ndarray> 213x320x3:uint8, <str> class0)

    >>> filepath = 'tests/data/img_arrays/*.jpg.npy'
    >>> samples = [(1, 'nut_color'), (2, 'nut_grayscale')]
    >>> samples >> ReadNumpy(1, filepath) >> PrintType() >> Consume()
    (<int> 1, <ndarray> 213x320x3:uint8)
    (<int> 2, <ndarray> 213x320:uint8)

    >>> pathfunc = lambda s: 'tests/data/img_arrays/{1}.jpg.npy'.format(*s)
    >>> samples >> ReadNumpy(1, pathfunc) >> PrintType() >> Consume()
    (<int> 1, <ndarray> 213x320x3:uint8)
    (<int> 2, <ndarray> 213x320:uint8)

    :param tuple|list sample: ('nut_data', 1)
    :param None|int|tuple columns: Indices of columns in sample to be replaced
                              by numpy array (based on fileid in that column)
                              If None then a flat sample is assumed and
                              a tuple with the numpy array is returned.
    :param string|function|None pathfunc: Filepath with wildcard '*',
      which is replaced by the file id/name provided in the sample, e.g.
      'tests/data/img_arrays/*.jpg.npy' for sample ('nut_grayscale', 2)
      will become 'tests/data/img_arrays/nut_grayscale.jpg.npy'
      or
      Function to compute path to numpy file from sample, e.g.
      lambda sample: 'tests/data/img_arrays/{1}.jpg.npy'.format(*sample)
      or
      None, in this case the file id/name is taken as the filepath.
    :param bool allow_pickle: Allow loading pickled object arrays in npy
      files. Passed on unchanged to numpy.load.
    :return: Sample with file ids/names replaced by numpy arrays.
    :rtype: tuple
    """

    def load(fileid):
        """Load numpy array for given fileid"""
        filepath = reader_filepath(sample, fileid, pathfunc)
        return np.load(filepath, allow_pickle=allow_pickle)

    if columns is None:
        # Flat sample: the sample itself is the file id. The result is still
        # wrapped in a one-element tuple so the output is always a tuple.
        return (load(sample),)

    colset = as_set(columns)
    # Only the selected columns are loaded; all others pass through untouched.
    return tuple(load(elem) if idx in colset else elem
                 for idx, elem in enumerate(sample))


@nut_function
def ReadImage(sample, columns, pathfunc=None, as_grey=False, dtype='uint8'):
    """
    Load images from filesystem for samples.

    Loads images in jpg, gif, png, tif and bmp format.
    Images are returned as numpy arrays of shape (h, w, c) or (h, w) for
    color images or gray scale images respectively.
    See nutsml.imageutil.load_image for details.

    Note that the loaded images replace the image file name|path in the
    sample. If the image file paths are directly provided (not as a tuple
    sample) still tuples with the loaded image are returned.

    >>> from nutsflow import Consume, Collect
    >>> from nutsml import PrintColType

    >>> images = ['tests/data/img_formats/nut_color.gif']
    >>> images >> ReadImage(None) >> PrintColType() >> Consume()
    item 0: <tuple>
      0: <ndarray> shape:213x320x3 dtype:uint8 range:0..255

    >>> samples = [('tests/data/img_formats/nut_color.gif', 'class0')]
    >>> img_samples = samples >> ReadImage(0) >> Collect()

    >>> imagepath = 'tests/data/img_formats/*.gif'
    >>> samples = [(1, 'nut_color'), (2, 'nut_grayscale')]
    >>> samples >> ReadImage(1, imagepath) >> PrintColType() >> Consume()
    item 0: <tuple>
      0: <int> 1
      1: <ndarray> shape:213x320x3 dtype:uint8 range:0..255
    item 1: <tuple>
      0: <int> 2
      1: <ndarray> shape:213x320 dtype:uint8 range:20..235

    >>> pathfunc = lambda s: 'tests/data/img_formats/{1}.jpg'.format(*s)
    >>> img_samples = samples >> ReadImage(1, pathfunc) >> Collect()

    :param tuple|list sample: ('nut_color', 1)
    :param None|int|tuple columns: Indices of columns in sample to be replaced
                              by image (based on image id in that column)
                              If None then a flat sample is assumed and
                              a tuple with the image is returned.
    :param string|function|None pathfunc: Filepath with wildcard '*',
      which is replaced by the imageid provided in the sample, e.g.
      'tests/data/img_formats/*.jpg' for sample ('nut_grayscale', 2)
      will become 'tests/data/img_formats/nut_grayscale.jpg'
      or
      Function to compute path to image file from sample, e.g.
      lambda sample: 'tests/data/img_formats/{1}.jpg'.format(*sample)
      or
      None, in this case the image id is taken as the filepath.
    :param bool as_grey: If true, load as grayscale image.
    :param dtype dtype: Numpy data type of the image.
    :return: Sample with image ids replaced by image (=ndarray)
            of shape (h, w, c) or (h, w)
    :rtype: tuple
    """

    def load(imageid):
        """Load image for given image id"""
        filepath = reader_filepath(sample, imageid, pathfunc)
        return load_image(filepath, as_grey=as_grey, dtype=dtype)

    if columns is None:
        # Flat sample: the sample itself is the image id. The image is still
        # wrapped in a one-element tuple so the output is always a tuple.
        return (load(sample),)

    colset = as_set(columns)
    # Only the selected columns are loaded; all others pass through untouched.
    return tuple(load(elem) if idx in colset else elem
                 for idx, elem in enumerate(sample))


class ReadPandas(NutSource):
    """
    Read data as Pandas table from file system.
    """

    def __init__(self, filepath, rows=None, colnames=None, dropnan=True,
                 replacenan=False, rowname='Row', **kwargs):
        """
        Create reader for Pandas tables.

        The reader returns the table contents as an iterator over named tuples,
        where the column names are derived from the table columns. The order
        and selection of columns can be changed.

        The table is loaded from the file when the reader is constructed;
        filtering and NaN handling are applied each time it is iterated.

        >>> from nutsflow import Collect, Consume, Print
        >>> filepath = 'tests/data/pandas_table.csv'

        >>> ReadPandas(filepath) >> Print() >> Consume()
        Row(col1=1.0, col2=4.0)
        Row(col1=3.0, col2=6.0)

        >>> (ReadPandas(filepath, dropnan=False, rowname='Sample') >>
        ... Print() >> Consume())
        Sample(col1=1.0, col2=4.0)
        Sample(col1=2.0, col2=nan)
        Sample(col1=3.0, col2=6.0)

        >>> ReadPandas(filepath, replacenan=None) >> Print() >> Consume()
        Row(col1=1.0, col2=4.0)
        Row(col1=2.0, col2=None)
        Row(col1=3.0, col2=6.0)

        >>> colnames=['col2', 'col1']   # swap order
        >>> ReadPandas(filepath, colnames=colnames) >> Print() >> Consume()
        Row(col2=4.0, col1=1.0)
        Row(col2=6.0, col1=3.0)

        >>> ReadPandas(filepath, rows='col1 > 1', replacenan=0) >> Collect()
        [Row(col1=2.0, col2=0), Row(col1=3.0, col2=6.0)]

        :param str filepath: Path to a table in CSV, TSV, XLSX or
          Pandas pickle format. Depending on file extension (e.g. .csv)
          the table format is picked.
          Note tables must have a header with the column names.
        :param str rows: Rows to filter. Any Pandas filter expression. If
          rows = None all rows of the table are returned.
        :param list colnames: List of names for the table columns to return.
          For colnames = None all columns are returned.
        :param bool dropnan: If True all rows that contain NaN are dropped.
          Only applies when replacenan is False.
        :param object replacenan: If not False all NaNs are replaced by
             the value of replacenan (and no rows are dropped).
        :param str rowname: Name of named tuple returned as rows.
        :param kwargs kwargs: Key word arguments passed on to the Pandas
          methods for data reading, e.g, header=None.
          See pandas/pandas/io/parsers.py for details

        """
        self.filepath = filepath
        self.rows = rows
        self.colnames = colnames
        self.dropnan = dropnan
        self.replacenan = replacenan
        self.rowname = rowname
        self.kwargs = kwargs
        # kwargs must be stored before loading: _load_table reads self.kwargs.
        self.dataframe = self._load_table(filepath)

    @staticmethod
    def isnull(value):
        """
        Return true if value is NaN or None.

        >>> import numpy as np
        >>> ReadPandas.isnull(np.nan)
        True

        >>> ReadPandas.isnull(None)
        True

        >>> ReadPandas.isnull(0)
        False

        :param value: Value to test
        :return: Return true for NaN or None values.
        :rtype: bool
        """
        return pd.isnull(value)

    def _replacenan(self, row):
        """
        Replace NaN/None values in row by the configured replacement value.

        The replacement is taken from self.replacenan.

        :param iterable row: Any iterable.
        :return: Row with self.replacenan instead of NaN
        :rtype: tuple
        """
        replacement = self.replacenan
        return tuple(replacement if pd.isnull(v) else v for v in row)

    def _load_table(self, filepath):
        """
        Load table from file system.

        The reader is chosen by the lower-cased file extension: '.tsv' and
        '.csv' use pandas.read_csv, '.xlsx' uses pandas.read_excel and any
        other extension is read with pandas.read_pickle. self.kwargs is
        passed on to the chosen reader.

        :param str filepath: Path to table in CSV, TSV, XLSX or
                   Pandas pickle format.
        :return: Pandas table
        :rtype: pandas.core.frame.DataFrame
        """
        _, ext = os.path.splitext(filepath.lower())
        if ext == '.tsv':
            return pd.read_csv(filepath, sep='\t', **self.kwargs)
        if ext == '.csv':
            return pd.read_csv(filepath, **self.kwargs)
        if ext == '.xlsx':
            return pd.read_excel(filepath, engine='openpyxl', **self.kwargs)
        # NOTE(security): every unrecognized extension falls through to
        # unpickling. Unpickling can execute arbitrary code, so only load
        # files from trusted sources.
        return pd.read_pickle(filepath, **self.kwargs)

    def __iter__(self):
        """
        Return iterator over rows in table.

        Applies the row filter and the column selection, then either
        replaces NaNs, drops rows containing NaNs, or returns rows as is.

        :return: Iterator over rows as named tuples.
        :rtype: iterator
        """
        df = self.dataframe
        rows = df.query(self.rows) if self.rows else df
        series = rows[self.colnames] if self.colnames else rows
        Row = namedtuple(self.rowname, series.columns.to_list())

        # Identity check on purpose: falsy replacements such as 0 or None
        # are valid and must still trigger replacement.
        if self.replacenan is not False:
            values = (self._replacenan(row) for row in series.values)
        elif self.dropnan:
            values = series.dropna().values
        else:
            values = series.values
        return (Row(*v) for v in values)