Source code for heat.utils.data._utils

"""
Data utilities module.
This file contains functions which may be useful for certain datatypes, but are not tested in the heat framework
This file contains standalone utilities for data preparation which may be useful
The functions contained within are not tested, nor actively supported
"""

import base64
import numpy as np
import os
import struct



[docs]
def dali_tfrecord2idx(train_dir, train_idx_dir, val_dir, val_idx_dir):
    """
    WARNING: This function likely requires adjustments and it is by no means a final product !!!
    This is a standalone utility for data preparation which may be useful.
    It is not tested, nor actively supported.

    Prepare TFRecord indexes for use with DALI. An index file is produced for every file in the
    given ``train_dir`` and ``val_dir`` directories. Each index file carries the same name as the
    data file it describes and contains one ``"<offset> <size>"`` line per record, where
    ``offset`` is the byte position of the record and ``size`` is its total length in bytes
    (length field + length CRC + payload + payload CRC).

    Parameters
    ----------
    train_dir : str
        directory containing the training TFRecord files
    train_idx_dir : str
        existing directory in which the training index files are written
    val_dir : str
        directory containing the validation TFRecord files
    val_idx_dir : str
        existing directory in which the validation index files are written

    Raises
    ------
    ValueError
        If an index directory is the same as its data directory. The index files share the names
        of the data files, so writing them there would overwrite the data.

    Notes
    -----
    If a file cannot be parsed, ``"Not a valid TFRecord file"`` is printed and the function moves
    on to the next file; the index lines written up to that point are kept.
    """
    # pair every data directory with its own index directory explicitly, so that the mapping
    # stays correct even when ``train_dir`` and ``val_dir`` are the same path
    for data_dir, idx_dir in ((train_dir, train_idx_dir), (val_dir, val_idx_dir)):
        if os.path.realpath(data_dir) == os.path.realpath(idx_dir):
            raise ValueError(
                f"index directory must differ from the data directory, both are {data_dir!r}"
            )
        for name in os.listdir(data_dir):
            # ``os.listdir`` yields bare names: resolve them against their directories
            src = os.path.join(data_dir, name)
            dst = os.path.join(idx_dir, name)
            with open(src, "rb") as f, open(dst, "w") as idx:
                while True:
                    current = f.tell()
                    try:
                        # length of the payload (8 bytes)
                        byte_len = f.read(8)
                        if len(byte_len) == 0:
                            # clean end of file
                            break
                        # crc of the length field (skipped, not verified)
                        f.read(4)
                        # explicit little-endian so the result does not depend on the host
                        proto_len = struct.unpack("<q", byte_len)[0]
                        # proto payload
                        f.read(proto_len)
                        # crc of the payload (skipped, not verified)
                        f.read(4)
                        idx.write(str(current) + " " + str(f.tell() - current) + "\n")
                    except Exception:
                        # best effort: e.g. a truncated length field makes struct.unpack fail
                        print("Not a valid TFRecord file")
                        break




[docs]
def merge_files_imagenet_tfrecord(folder_name, output_folder=None):
    """
    WARNING: This function likely requires adjustments and it is by no means a final product !!!
    This is a standalone utility for data preparation which may be useful.
    It is not tested, nor actively supported.

    Merge multiple preprocessed imagenet TFRecord files together. The result is two HDF5 files
    (training and validation) with all of the images stacked in the 0th dimension.

    Parameters
    ----------
    folder_name : str
        folder containing the TFRecord files to join. Files whose names start with ``train`` are
        merged into the training file, files whose names start with ``val`` into the validation
        file. Files are processed in sorted name order.
    output_folder : str, optional
        location to create the output files. Defaults to the current directory

    Notes
    -----
    Requires ``h5py`` and ``tensorflow``; both are imported when the function is called.

    Metadata for both the created files (`imagenet_merged.h5` and `imagenet_merged_validation.h5`):

    The datasets are the combination of all of the images in the Image-net 2012 dataset.
    The data is split into training and validation.

    imagenet_merged.h5 -> training
    imagenet_merged_validation.h5 -> validation

    both files have the same internal structure:
    - file
            * "images" : encoded ASCII string of the decoded RGB JPEG image.
                    - to decode: `torch.as_tensor(bytearray(base64.binascii.a2b_base64(string_repr.encode('ascii'))), dtype=torch.uint8)`
                    - note: the images must be reshaped using: `.reshape(file["metadata"]["image/height"], file["metadata"]["image/width"], 3)`
                            (3 is the number of channels, all images are RGB)
            * "metadata" : the metadata for each image quotes are the titles for each column
                    0. "image/height"
                    1. "image/width"
                    2. "image/channels"
                    3. "image/class/label" (stored as the original label minus one)
                    4. "image/object/bbox/xmin"
                    5. "image/object/bbox/xmax"
                    6. "image/object/bbox/ymin"
                    7. "image/object/bbox/ymax"
                    8. "image/object/bbox/label" (stored as the original label minus one)
              Only the first bounding box of an image is stored. If an image has no bounding
              box, the box defaults to (0, width, 0, height) and the bbox label is set to -2.
            * "file_info" : string information related to each image
                    0. "image/format"
                    1. "image/filename"
                    2. "image/class/synset"
                    3. "image/class/text"

    The column titles are attached to the "metadata" and "file_info" datasets as the
    ``column_names`` attribute.

    The dataset was created using the preprocessed data from the script:
        https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh

    """
    import h5py
    import tensorflow as tf

    # Features expected in every preprocessed TFRecord example:
    #   image/encoded: string containing JPEG encoded image in RGB colorspace
    #   image/height: integer, image height in pixels
    #   image/width: integer, image width in pixels
    #   image/colorspace: string, specifying the colorspace, always 'RGB'
    #   image/channels: integer, specifying the number of channels, always 3
    #   image/format: string, specifying the format, always 'JPEG'
    #   image/filename: string containing the basename of the image file
    #           e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
    #   image/class/label: integer specifying the index in a classification layer.
    #           The label ranges from [1, 1000] where 0 is not used.
    #   image/class/synset: string specifying the unique ID of the label, e.g. 'n01440764'
    #   image/class/text: string specifying the human-readable version of the label
    #           e.g. 'red fox, Vulpes vulpes'
    #   image/object/bbox/xmin: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/xmax: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/ymin: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/ymax: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/label: integer specifying the index in a classification
    #           layer. The label ranges from [1, 1000] where 0 is not used. Note this is
    #           always identical to the image label.

    metadata_columns = [
        "image/height",
        "image/width",
        "image/channels",
        "image/class/label",
        "image/object/bbox/xmin",
        "image/object/bbox/xmax",
        "image/object/bbox/ymin",
        "image/object/bbox/ymax",
        "image/object/bbox/label",
    ]
    file_info_columns = [
        "image/format",
        "image/filename",
        "image/class/synset",
        "image/class/text",
    ]

    # collect the input files; ``sorted`` returns the list (``list.sort()`` returns None)
    all_files = sorted(os.listdir(folder_name))
    train_names = [os.path.join(folder_name, f) for f in all_files if f.startswith("train")]
    val_names = [os.path.join(folder_name, f) for f in all_files if f.startswith("val")]

    # an unset output folder means the current directory
    out_dir = "" if output_folder is None else output_folder
    output_name_train = os.path.join(out_dir, "imagenet_merged.h5")
    output_name_val = os.path.join(out_dir, "imagenet_merged_validation.h5")

    def _single_file_load(src):
        # read one TFRecord file; returns the base64 image strings, a (n, 9) float64 metadata
        # array and a (n, 4) array of the string fields
        dataset = tf.data.TFRecordDataset(filenames=[src])
        imgs = []
        img_meta = [[] for _ in metadata_columns]
        file_arr = [[] for _ in file_info_columns]
        for raw_example in dataset:
            parsed = tf.train.Example.FromString(raw_example.numpy())
            feature = parsed.features.feature
            img_str = feature["image/encoded"].bytes_list.value[0]
            img = tf.image.decode_jpeg(img_str, channels=3).numpy()
            # to decode: np.frombuffer(base64.binascii.a2b_base64(string_repr.encode('ascii')))
            imgs.append(base64.binascii.b2a_base64(img).decode("ascii"))

            height = float(feature["image/height"].int64_list.value[0])
            width = float(feature["image/width"].int64_list.value[0])
            img_meta[0].append(height)
            img_meta[1].append(width)
            img_meta[2].append(float(feature["image/channels"].int64_list.value[0]))
            # labels are shifted by one so that they start at 0
            img_meta[3].append(feature["image/class/label"].int64_list.value[0] - 1)
            try:
                bbxmin = feature["image/object/bbox/xmin"].float_list.value[0]
                bbxmax = feature["image/object/bbox/xmax"].float_list.value[0]
                bbymin = feature["image/object/bbox/ymin"].float_list.value[0]
                bbymax = feature["image/object/bbox/ymax"].float_list.value[0]
                bblabel = feature["image/object/bbox/label"].int64_list.value[0] - 1
            except IndexError:
                # no bounding box annotated: fall back to the full image and mark the label
                bbxmin = 0.0
                bbxmax = width
                bbymin = 0.0
                bbymax = height
                bblabel = -2

            # builtin ``float`` replaces the ``np.float`` alias removed in NumPy 1.24
            img_meta[4].append(float(bbxmin))
            img_meta[5].append(float(bbxmax))
            img_meta[6].append(float(bbymin))
            img_meta[7].append(float(bbymax))
            img_meta[8].append(bblabel)

            file_arr[0].append(feature["image/format"].bytes_list.value[0])
            file_arr[1].append(feature["image/filename"].bytes_list.value[0])
            file_arr[2].append(feature["image/class/synset"].bytes_list.value[0])
            file_arr[3].append(feature["image/class/text"].bytes_list.value[0])
        # need to transpose because of the way that numpy understands nested lists
        img_meta = np.array(img_meta, dtype=np.float64).T
        file_arr = np.array(file_arr).T
        return imgs, img_meta, file_arr

    def _write_datasets(imgs, img_meta, file_arr, past_size, h5file):
        # grow the three datasets and store the new rows behind the ``past_size`` existing ones
        new_size = past_size + len(imgs)
        h5file["images"].resize((new_size,))
        h5file["images"][past_size:new_size] = imgs
        h5file["metadata"].resize((new_size, 9))
        h5file["metadata"][past_size:new_size] = img_meta
        h5file["file_info"].resize((new_size, 4))
        h5file["file_info"][past_size:new_size] = file_arr

    def _load_multiple_files(names, h5file):
        # read the given TFRecord files in order and append their contents to ``h5file``
        past_size = 0
        imgs_buf, meta_buf, info_buf = [], [], []
        for i, name in enumerate(names):
            imgs, img_meta, file_arr = _single_file_load(name)
            imgs_buf.extend(imgs)
            meta_buf.append(img_meta)
            info_buf.append(file_arr)
            # when 2 files are read, write to the output file
            if i % 2 == 1:
                print(past_size)
                _write_datasets(
                    imgs_buf, np.vstack(meta_buf), np.vstack(info_buf), past_size, h5file
                )
                past_size += len(imgs_buf)
                imgs_buf, meta_buf, info_buf = [], [], []

        # an odd number of files leaves one file unwritten
        if meta_buf:
            _write_datasets(imgs_buf, np.vstack(meta_buf), np.vstack(info_buf), past_size, h5file)

    # the context manager guarantees that both output files are flushed and closed
    with h5py.File(output_name_train, "w") as train_file, h5py.File(
        output_name_val, "w"
    ) as val_file:
        dt = h5py.string_dtype(encoding="ascii")
        # the initial sizes are placeholders, every write resizes the datasets
        # NOTE(review): "S10" is a fixed width of 10 bytes; longer values such as the example
        # file names above look like they would be truncated -- confirm this is intended
        train_file.create_dataset("images", (2502,), chunks=(1251,), maxshape=(None,), dtype=dt)
        train_file.create_dataset("metadata", (2502, 9), chunks=(1251, 9), maxshape=(None, 9))
        train_file.create_dataset(
            "file_info", (2502, 4), chunks=(1251, 4), maxshape=(None, 4), dtype="S10"
        )

        val_file.create_dataset("images", (50000,), chunks=True, maxshape=(None,), dtype=dt)
        val_file.create_dataset("metadata", (50000, 9), chunks=True, maxshape=(None, 9))
        val_file.create_dataset(
            "file_info", (50000, 4), chunks=True, maxshape=(None, 4), dtype="S10"
        )

        _load_multiple_files(train_names, train_file)
        _load_multiple_files(val_names, val_file)

        # add the label names to the datasets
        for h5file in (train_file, val_file):
            h5file["metadata"].attrs["column_names"] = metadata_columns
            h5file["file_info"].attrs["column_names"] = file_info_columns