# preprocess_data.py
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Preprocess dataset and prepare it for training

Example usage:
$ python preprocess_data.py --input_dir ./src --output_dir ./dst --vol_per_file 2

All arguments are listed under `python preprocess_data.py -h`.
"""
import argparse
import os
from random import shuffle

import nibabel as nib
import numpy as np
import tensorflow as tf
  26. PARSER = argparse.ArgumentParser()
  27. PARSER.add_argument('--input_dir', '-i',
  28. type=str, help='path to the input directory with data')
  29. PARSER.add_argument('--output_dir', '-o',
  30. type=str, help='path to the output directory where tfrecord files will be stored')
  31. PARSER.add_argument('--verbose', '-v', dest='verbose', action='store_true', default=False)
  32. PARSER.add_argument('--vol_per_file', default=4, dest='vol_per_file',
  33. type=int, help='how many volumes to pack into a single tfrecord file')
  34. PARSER.add_argument('--single_data_dir', dest='single_data_dir', action='store_true', default=False)
  35. def load_features(path):
  36. """ Load features from Nifti
  37. :param path: Path to dataset
  38. :return: Loaded data
  39. """
  40. data = np.zeros((240, 240, 155, 4), dtype=np.uint8)
  41. name = os.path.basename(path)
  42. for i, modality in enumerate(["_t1.nii.gz", "_t1ce.nii.gz", "_t2.nii.gz", "_flair.nii.gz"]):
  43. vol = load_single_nifti(os.path.join(path, name + modality)).astype(np.float32)
  44. vol[vol > 0.85 * vol.max()] = 0.85 * vol.max()
  45. vol = 255 * vol / vol.max()
  46. data[..., i] = vol.astype(np.uint8)
  47. return data
  48. def load_segmentation(path):
  49. """ Load segmentations from Nifti
  50. :param path: Path to dataset
  51. :return: Loaded data
  52. """
  53. path = os.path.join(path, os.path.basename(path)) + "_seg.nii.gz"
  54. return load_single_nifti(path).astype(np.uint8)
  55. def load_single_nifti(path):
  56. """ Load Nifti file as numpy
  57. :param path: Path to file
  58. :return: Loaded data
  59. """
  60. data = nib.load(path).get_fdata().astype(np.int16)
  61. return np.transpose(data, (1, 0, 2))
  62. def write_to_file(features_list, labels_list, foreground_mean_list, foreground_std_list, output_dir, # pylint: disable=R0913
  63. count):
  64. """ Dump numpy array to tfrecord
  65. :param features_list: List of features
  66. :param labels_list: List of labels
  67. :param foreground_mean_list: List of means for each volume
  68. :param foreground_std_list: List of std for each volume
  69. :param output_dir: Directory where to write
  70. :param count: Index of the record
  71. :return:
  72. """
  73. output_filename = os.path.join(output_dir, "volume-{}.tfrecord".format(count))
  74. filelist = list(zip(np.array(features_list),
  75. np.array(labels_list),
  76. np.array(foreground_mean_list),
  77. np.array(foreground_std_list)))
  78. np_to_tfrecords(filelist, output_filename)
  79. def np_to_tfrecords(filelist, output_filename):
  80. """ Convert numpy array to tfrecord
  81. :param filelist: List of files
  82. :param output_filename: Destination directory
  83. """
  84. writer = tf.io.TFRecordWriter(output_filename)
  85. for file_item in filelist:
  86. sample = file_item[0].flatten().tostring()
  87. label = file_item[1].flatten().tostring()
  88. mean = file_item[2].astype(np.float32).flatten()
  89. stdev = file_item[3].astype(np.float32).flatten()
  90. d_feature = {}
  91. d_feature['X'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[sample]))
  92. d_feature['Y'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[label]))
  93. d_feature['mean'] = tf.train.Feature(float_list=tf.train.FloatList(value=mean))
  94. d_feature['stdev'] = tf.train.Feature(float_list=tf.train.FloatList(value=stdev))
  95. features = tf.train.Features(feature=d_feature)
  96. example = tf.train.Example(features=features)
  97. serialized = example.SerializeToString()
  98. writer.write(serialized)
  99. writer.close()
  100. def main(): # pylint: disable=R0914
  101. """ Starting point of the application"""
  102. params = PARSER.parse_args()
  103. input_dir = params.input_dir
  104. output_dir = params.output_dir
  105. os.makedirs(params.output_dir, exist_ok=True)
  106. patient_list = []
  107. if params.single_data_dir:
  108. patient_list.extend([os.path.join(input_dir, folder) for folder in os.listdir(input_dir)])
  109. else:
  110. assert "HGG" in os.listdir(input_dir) and "LGG" in os.listdir(input_dir), \
  111. "Data directory has to contain folders named HGG and LGG. " \
  112. "If you have a single folder with patient's data please set --single_data_dir flag"
  113. path_hgg = os.path.join(input_dir, "HGG")
  114. path_lgg = os.path.join(input_dir, "LGG")
  115. patient_list.extend([os.path.join(path_hgg, folder) for folder in os.listdir(path_hgg)])
  116. patient_list.extend([os.path.join(path_lgg, folder) for folder in os.listdir(path_lgg)])
  117. shuffle(patient_list)
  118. features_list = []
  119. labels_list = []
  120. foreground_mean_list = []
  121. foreground_std_list = []
  122. count = 0
  123. total_tfrecord_files = len(patient_list) // params.vol_per_file + (1 if len(patient_list) % params.vol_per_file
  124. else 0)
  125. for i, folder in enumerate(patient_list):
  126. # Calculate mean and stdev only for foreground voxels
  127. features = load_features(folder)
  128. foreground = features > 0
  129. fg_mean = np.array([(features[..., i][foreground[..., i]]).mean() for i in range(features.shape[-1])])
  130. fg_std = np.array([(features[..., i][foreground[..., i]]).std() for i in range(features.shape[-1])])
  131. # BraTS labels are 0,1,2,4 -> switching to 0,1,2,3
  132. labels = load_segmentation(folder)
  133. labels[labels == 4] = 3
  134. features_list.append(features)
  135. labels_list.append(labels)
  136. foreground_mean_list.append(fg_mean)
  137. foreground_std_list.append(fg_std)
  138. if (i + 1) % params.vol_per_file == 0:
  139. write_to_file(features_list, labels_list, foreground_mean_list, foreground_std_list, output_dir, count)
  140. # Clear lists
  141. features_list = []
  142. labels_list = []
  143. foreground_mean_list = []
  144. foreground_std_list = []
  145. count += 1
  146. if params.verbose:
  147. print("{}/{} tfrecord files created".format(count, total_tfrecord_files))
  148. # create one more file if there are any remaining unpacked volumes
  149. if features_list:
  150. write_to_file(features_list, labels_list, foreground_mean_list, foreground_std_list, output_dir, count)
  151. count += 1
  152. if params.verbose:
  153. print("{}/{} tfrecord files created".format(count, total_tfrecord_files))
  154. if __name__ == '__main__':
  155. main()