Source code for heat.utils.data._utils

"""
Data utilities module.
This file contains functions which may be useful for certain datatypes, but are not tested in the heat framework.
This file contains standalone utilities for data preparation which may be useful.
The functions contained within are not tested, nor actively supported.
"""

import base64
import numpy as np
import os
import struct


def dali_tfrecord2idx(train_dir, train_idx_dir, val_dir, val_idx_dir):
    """
    WARNING: This function likely requires adjustments and it is by no means a final product !!!

    This is a standalone data-preparation utility; it is not tested, nor actively supported.

    Prepare TFRecord index files for use with DALI. An index is produced for every regular file
    found in ``train_dir`` and ``val_dir``. Each index is written under the same file name as its
    source file, into ``train_idx_dir`` and ``val_idx_dir`` respectively. Every line of an index
    has the form ``"<record offset> <record size>"`` (both in bytes).

    Parameters
    ----------
    train_dir : str
        directory containing the training TFRecord files
    train_idx_dir : str
        directory in which the training index files are created (must already exist)
    val_dir : str
        directory containing the validation TFRecord files
    val_idx_dir : str
        directory in which the validation index files are created (must already exist)

    Notes
    -----
    If a file cannot be parsed, ``"Not a valid TFRecord file"`` is printed and processing
    continues with the next file; the index written so far for that file is kept.
    """
    # pair every source directory with its own index directory explicitly, so that the
    # validation indexes land in ``val_idx_dir`` even when both source directories are equal
    for src_dir, idx_dir in ((train_dir, train_idx_dir), (val_dir, val_idx_dir)):
        for name in os.listdir(src_dir):
            # os.listdir returns bare file names -> they must be joined with their directory
            src_path = os.path.join(src_dir, name)
            if not os.path.isfile(src_path):
                # sub-directories cannot be indexed
                continue
            with open(src_path, "rb") as f, open(os.path.join(idx_dir, name), "w") as idx:
                while True:
                    current = f.tell()
                    try:
                        # length
                        byte_len = f.read(8)
                        if len(byte_len) == 0:
                            # clean end of file
                            break
                        # crc
                        f.read(4)
                        proto_len = struct.unpack("q", byte_len)[0]
                        # proto
                        f.read(proto_len)
                        # crc
                        f.read(4)
                        idx.write(str(current) + " " + str(f.tell() - current) + "\n")
                    except Exception:
                        # best effort: report the malformed file and move on to the next one
                        print("Not a valid TFRecord file")
                        break


def merge_files_imagenet_tfrecord(folder_name, output_folder=None):
    """
    WARNING: This function likely requires adjustments and it is by no means a final product !!!

    This is a standalone data-preparation utility; it is not tested, nor actively supported.

    Merge multiple preprocessed ImageNet TFRecord files together. The result is one HDF5 file for
    the training data and one for the validation data, each with all of the images stacked in
    the 0th dimension.

    Parameters
    ----------
    folder_name : str
        folder containing the TFRecord files to join. Files whose names start with ``train`` are
        merged into the training file, files whose names start with ``val`` into the validation
        file. Both groups are processed in sorted order.
    output_folder : str, optional
        location to create the output files. Defaults to the current directory.

    Notes
    -----
    Requires ``h5py`` and ``tensorflow``; both are imported when the function is called.

    Metadata for both the created files (`imagenet_merged.h5` and `imagenet_merged_validation.h5`):

    The datasets are the combination of all of the images in the Image-net 2012 dataset.
    The data is split into training and validation.

        imagenet_merged.h5 -> training
        imagenet_merged_validation.h5 -> validation

    both files have the same internal structure:

    - file
        * "images" : base64-encoded ASCII string of the decoded RGB JPEG image.
            - to decode: `torch.as_tensor(bytearray(base64.binascii.a2b_base64(string_repr.encode('ascii'))), dtype=torch.uint8)`
            - note: the images must be reshaped using:
              `.reshape(file["metadata"]["image/height"], file["metadata"]["image/width"], 3)`
              (3 is the number of channels, all images are RGB)
        * "metadata" : the metadata for each image, quotes are the titles for each column
            0. "image/height"
            1. "image/width"
            2. "image/channels"
            3. "image/class/label"
            4. "image/object/bbox/xmin"
            5. "image/object/bbox/xmax"
            6. "image/object/bbox/ymin"
            7. "image/object/bbox/ymax"
            8. "image/object/bbox/label"
        * "file_info" : string information related to each image
            0. "image/format"
            1. "image/filename"
            2. "image/class/synset"
            3. "image/class/text"

    The column titles are stored in the ``column_names`` attribute of the "metadata" and
    "file_info" datasets. Class labels and bounding-box labels are stored shifted by -1 with
    respect to the TFRecord values; images without a bounding box get a box covering the whole
    image and a bounding-box label of -2.

    The dataset was created using the preprocessed data from the script:
    https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh
    """
    import h5py
    import tensorflow as tf

    # features contained in every TFRecord example:
    #   image/encoded: string containing JPEG encoded image in RGB colorspace
    #   image/height: integer, image height in pixels
    #   image/width: integer, image width in pixels
    #   image/colorspace: string, specifying the colorspace, always 'RGB'
    #   image/channels: integer, specifying the number of channels, always 3
    #   image/format: string, specifying the format, always 'JPEG'
    #   image/filename: string containing the basename of the image file
    #       e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
    #   image/class/label: integer specifying the index in a classification layer.
    #       The label ranges from [1, 1000] where 0 is not used.
    #   image/class/synset: string specifying the unique ID of the label, e.g. 'n01440764'
    #   image/class/text: string specifying the human-readable version of the label
    #       e.g. 'red fox, Vulpes vulpes'
    #   image/object/bbox/xmin: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/xmax: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/ymin: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/ymax: list of integers specifying the 0+ human annotated bounding boxes
    #   image/object/bbox/label: integer specifying the index in a classification layer.
    #       The label ranges from [1, 1000] where 0 is not used.
    #       Note this is always identical to the image label.

    # column titles of the "metadata" and "file_info" datasets, in storage order
    metadata_columns = [
        "image/height",
        "image/width",
        "image/channels",
        "image/class/label",
        "image/object/bbox/xmin",
        "image/object/bbox/xmax",
        "image/object/bbox/ymin",
        "image/object/bbox/ymax",
        "image/object/bbox/label",
    ]
    file_info_columns = [
        "image/format",
        "image/filename",
        "image/class/synset",
        "image/class/text",
    ]

    # get the files from the contents of the folder
    # (``sorted`` returns the list; ``list.sort()`` sorts in place and returns None)
    entries = os.listdir(folder_name)
    train_names = sorted(os.path.join(folder_name, f) for f in entries if f.startswith("train"))
    val_names = sorted(os.path.join(folder_name, f) for f in entries if f.startswith("val"))

    # an empty directory component makes the outputs land in the current directory
    out_dir = "" if output_folder is None else output_folder
    output_name_lcl_train = os.path.join(out_dir, "imagenet_merged.h5")
    output_name_lcl_val = os.path.join(out_dir, "imagenet_merged_validation.h5")

    def _single_file_load(src):
        # load one TFRecord file and return (images, metadata array, file info array)
        dataset = tf.data.TFRecordDataset(filenames=[src])
        imgs = []
        img_meta = [[] for _ in range(9)]
        file_arr = [[] for _ in range(4)]
        for raw_example in iter(dataset):
            parsed = tf.train.Example.FromString(raw_example.numpy())
            feature = parsed.features.feature

            img_str = feature["image/encoded"].bytes_list.value[0]
            img = tf.image.decode_jpeg(img_str, channels=3).numpy()
            # to decode: np.frombuffer(base64.binascii.a2b_base64(string_repr.encode('ascii')))
            string_repr = base64.binascii.b2a_base64(img).decode("ascii")
            imgs.append(string_repr)

            img_meta[0].append(
                tf.cast(feature["image/height"].int64_list.value[0], tf.float32).numpy()
            )
            img_meta[1].append(
                tf.cast(feature["image/width"].int64_list.value[0], tf.float32).numpy()
            )
            img_meta[2].append(
                tf.cast(feature["image/channels"].int64_list.value[0], tf.float32).numpy()
            )
            img_meta[3].append(feature["image/class/label"].int64_list.value[0] - 1)
            try:
                bbxmin = feature["image/object/bbox/xmin"].float_list.value[0]
                bbxmax = feature["image/object/bbox/xmax"].float_list.value[0]
                bbymin = feature["image/object/bbox/ymin"].float_list.value[0]
                bbymax = feature["image/object/bbox/ymax"].float_list.value[0]
                bblabel = feature["image/object/bbox/label"].int64_list.value[0] - 1
            except IndexError:
                # no annotated bounding box: use the whole image and mark the label with -2
                bbxmin = 0.0
                bbxmax = img_meta[1][-1]
                bbymin = 0.0
                bbymax = img_meta[0][-1]
                bblabel = -2
            # builtin ``float`` replaces the ``np.float`` alias, which newer NumPy no longer has
            img_meta[4].append(float(bbxmin))
            img_meta[5].append(float(bbxmax))
            img_meta[6].append(float(bbymin))
            img_meta[7].append(float(bbymax))
            img_meta[8].append(bblabel)

            file_arr[0].append(feature["image/format"].bytes_list.value[0])
            file_arr[1].append(feature["image/filename"].bytes_list.value[0])
            file_arr[2].append(feature["image/class/synset"].bytes_list.value[0])
            file_arr[3].append(np.array(feature["image/class/text"].bytes_list.value[0]))
        # need to transpose because of the way that numpy understands nested lists
        img_meta = np.array(img_meta, dtype=np.float64).T
        file_arr = np.array(file_arr).T
        return imgs, img_meta, file_arr

    def _write_datasets(img_outl, img_metal, file_arrl, past_sizel, file):
        # resize the three datasets so they end after the new rows, then write the rows
        # starting at ``past_sizel``
        file["images"].resize((past_sizel + len(img_outl),))
        file["images"][past_sizel : len(img_outl) + past_sizel] = img_outl
        file["metadata"].resize((past_sizel + img_metal.shape[0], 9))
        file["metadata"][past_sizel : img_metal.shape[0] + past_sizel] = img_metal
        file["file_info"].resize((past_sizel + img_metal.shape[0], 4))
        file["file_info"][past_sizel : img_metal.shape[0] + past_sizel] = file_arrl

    def _load_multiple_files(names, file):
        # read all TFRecord files in ``names`` and append their contents to ``file``
        img_out, img_meta, file_arr = None, None, None
        past_size = 0
        for i, f in enumerate(names):
            # this is where the data is created
            imgs, img_metaf, file_arrf = _single_file_load(f)
            # create a larger ndarray with the results
            if img_out is not None:
                img_out.extend(imgs)
            else:
                img_out = imgs
            img_meta = np.vstack((img_meta, img_metaf)) if img_meta is not None else img_metaf
            file_arr = np.vstack((file_arr, file_arrf)) if file_arr is not None else file_arrf
            # when 2 files are read, write to the output file
            if i % 2 == 1:
                print(past_size)
                _write_datasets(img_out, img_meta, file_arr, past_size, file)
                past_size += len(img_out)
                img_out, img_meta, file_arr = None, None, None
            del imgs, img_metaf, file_arrf
        # write the leftover of an odd number of files
        if img_out is not None:
            _write_datasets(img_out, img_meta, file_arr, past_size, file)

    dt = h5py.string_dtype(encoding="ascii")
    # the context managers make sure that both files are flushed and closed, also on errors
    with h5py.File(output_name_lcl_train, "w") as train_lcl_file, h5py.File(
        output_name_lcl_val, "w"
    ) as val_lcl_file:
        # create the output datasets, the initial shapes are placeholders (resized on write)
        # NOTE(review): dtype "S10" holds at most 10 bytes per entry, so longer file names and
        # class texts look like they get truncated -- confirm whether this is intended
        train_lcl_file.create_dataset(
            "images", (2502,), chunks=(1251,), maxshape=(None,), dtype=dt
        )
        train_lcl_file.create_dataset(
            "metadata", (2502, 9), chunks=(1251, 9), maxshape=(None, 9)
        )
        train_lcl_file.create_dataset(
            "file_info", (2502, 4), chunks=(1251, 4), maxshape=(None, 4), dtype="S10"
        )

        val_lcl_file.create_dataset("images", (50000,), chunks=True, maxshape=(None,), dtype=dt)
        val_lcl_file.create_dataset("metadata", (50000, 9), chunks=True, maxshape=(None, 9))
        val_lcl_file.create_dataset(
            "file_info", (50000, 4), chunks=True, maxshape=(None, 4), dtype="S10"
        )

        _load_multiple_files(train_names, train_lcl_file)
        _load_multiple_files(val_names, val_lcl_file)

        # add the label names to the datasets
        for lcl_file in (train_lcl_file, val_lcl_file):
            lcl_file["metadata"].attrs["column_names"] = metadata_columns
            lcl_file["file_info"].attrs["column_names"] = file_info_columns