# coco_pipeline.py
  1. # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import ctypes
  15. import time
  16. import logging
  17. import numpy as np
  18. import torch
  19. # DALI imports
  20. import nvidia.dali as dali
  21. from nvidia.dali.pipeline import Pipeline
  22. from nvidia.dali.types import to_numpy_type
class COCOPipeline(Pipeline):
    """DALI pipeline producing SSD training batches from a COCO dataset.

    Reads images and annotations with the DALI COCO reader, applies the
    SSD-style augmentations built below (random bbox crop, horizontal flip,
    resize to 300x300, HSV and brightness/contrast jitter, crop-mirror
    normalization) and encodes ground-truth boxes against the provided
    default (anchor) boxes.

    Parameters
    ----------
    batch_size : int
        Number of samples per batch.
    file_root : str
        Root directory containing the COCO images.
    annotations_file : str
        Path to the COCO annotations file.
    default_boxes
        Callable-style anchor provider; ``default_boxes(order='ltrb')`` must
        return a torch tensor of anchors (flattened into the BoxEncoder).
    device_id : int
        GPU ordinal this pipeline runs on.
    num_shards : int
        Total number of dataset shards (typically the distributed world size).
    output_fp16 : bool
        If True, emit FP16 images; otherwise FP32.
    output_nhwc : bool
        If True, emit NHWC layout; otherwise NCHW.
    pad_output : bool
        Passed through to CropMirrorNormalize; pads the output channels.
    num_threads : int
        Number of CPU worker threads for the pipeline.
    seed : int
        Seed for the pipeline's random number generation.
    """

    def __init__(self, batch_size, file_root, annotations_file, default_boxes,
                 device_id, num_shards,
                 output_fp16=False, output_nhwc=False, pad_output=False,
                 num_threads=1, seed=15):
        super(COCOPipeline, self).__init__(batch_size=batch_size,
                                           device_id=device_id,
                                           num_threads=num_threads,
                                           seed=seed)

        # Shard by global rank when running under torch.distributed;
        # otherwise this process reads shard 0.
        if torch.distributed.is_initialized():
            shard_id = torch.distributed.get_rank()
        else:
            shard_id = 0

        # Data loader and image decoder
        self.input = dali.ops.readers.COCO(file_root=file_root,
                                           annotations_file=annotations_file,
                                           shard_id=shard_id,
                                           num_shards=num_shards,
                                           ratio=True,
                                           ltrb=True,
                                           shuffle_after_epoch=True,
                                           skip_empty=True)
        self.decode_slice = dali.ops.decoders.ImageSlice(device="cpu",
                                                         output_type=dali.types.RGB)

        # Augmentation techniques
        ## Random crop (SSD-style: sample crops until one passes an IoU
        ## threshold drawn from `thresholds`, or keep the full image)
        self.crop = dali.ops.RandomBBoxCrop(device="cpu",
                                            aspect_ratio=[0.5, 2.0],
                                            thresholds=[0, 0.1, 0.3, 0.5, 0.7, 0.9],
                                            scaling=[0.3, 1.0],
                                            bbox_layout="xyXY",
                                            allow_no_crop=True,
                                            num_attempts=1)
        ## Color twist
        self.hsv = dali.ops.Hsv(device="gpu",
                                dtype=dali.types.FLOAT)  # use float to avoid clipping and quantizing the intermediate result
        self.bc = dali.ops.BrightnessContrast(device="gpu",
                                              contrast_center=128,  # input is in the [0, 255] range
                                              dtype=dali.types.UINT8)

        ## Cropping and normalization
        dtype = dali.types.FLOAT16 if output_fp16 else dali.types.FLOAT
        output_layout = dali.types.NHWC if output_nhwc else dali.types.NCHW
        self.normalize = dali.ops.CropMirrorNormalize(
            device="gpu",
            crop=(300, 300),
            mean=[0.0, 0.0, 0.0],
            std=[255.0, 255.0, 255.0],
            mirror=0,
            dtype=dtype,
            output_layout=output_layout,
            pad_output=pad_output)

        ## Flipping (image flip is paired with a bbox flip in define_graph)
        self.flip = dali.ops.Flip(device="cpu")
        self.bbflip = dali.ops.BbFlip(device="cpu", ltrb=True)

        # Resize to the fixed SSD300 input resolution
        self.resize = dali.ops.Resize(device="cpu",
                                      resize_x=300,
                                      resize_y=300)

        # Random variables feeding the augmentations above
        self.rng1 = dali.ops.random.Uniform(range=[0.5, 1.5])      # saturation / contrast
        self.rng2 = dali.ops.random.Uniform(range=[0.875, 1.125])  # brightness
        self.rng3 = dali.ops.random.Uniform(range=[-0.5, 0.5])     # hue
        self.flip_coin = dali.ops.random.CoinFlip(probability=0.5)

        # bbox encoder: matches ground-truth boxes to anchors at IoU >= 0.5
        self.anchors = default_boxes(order='ltrb').cpu().numpy().flatten().tolist()
        self.box_encoder = dali.ops.BoxEncoder(device="cpu",
                                               criteria=0.5,
                                               anchors=self.anchors)

    def define_graph(self):
        """Build the DALI graph.

        Returns
        -------
        tuple
            ``(images, bboxes, labels)`` — all outputs moved to GPU;
            bboxes/labels are the anchor-encoded targets.
        """
        saturation = self.rng1()
        contrast = self.rng1()
        brightness = self.rng2()
        hue = self.rng3()
        coin_rnd = self.flip_coin()

        inputs, bboxes, labels = self.input(name="Reader")
        crop_begin, crop_size, bboxes, labels = self.crop(bboxes, labels)
        # Decode only the cropped window (fused decode + slice)
        images = self.decode_slice(inputs, crop_begin, crop_size)
        # Flip image and boxes with the same coin so they stay consistent
        images = self.flip(images, horizontal=coin_rnd)
        bboxes = self.bbflip(bboxes, horizontal=coin_rnd)
        images = self.resize(images)
        images = images.gpu()
        images = self.hsv(images, hue=hue, saturation=saturation)
        images = self.bc(images, brightness=brightness, contrast=contrast)
        images = self.normalize(images)
        bboxes, labels = self.box_encoder(bboxes, labels)

        # bboxes and images and labels on GPU
        return (images, bboxes.gpu(), labels.gpu())
  110. to_torch_type = {
  111. np.float32 : torch.float32,
  112. np.float64 : torch.float64,
  113. np.float16 : torch.float16,
  114. np.uint8 : torch.uint8,
  115. np.int8 : torch.int8,
  116. np.int16 : torch.int16,
  117. np.int32 : torch.int32,
  118. np.int64 : torch.int64
  119. }
  120. def feed_ndarray(dali_tensor, arr):
  121. """
  122. Copy contents of DALI tensor to pyTorch's Tensor.
  123. Parameters
  124. ----------
  125. `dali_tensor` : nvidia.dali.backend.TensorCPU or nvidia.dali.backend.TensorGPU
  126. Tensor from which to copy
  127. `arr` : torch.Tensor
  128. Destination of the copy
  129. """
  130. assert dali_tensor.shape() == list(arr.size()), \
  131. ("Shapes do not match: DALI tensor has size {0}"
  132. ", but PyTorch Tensor has size {1}".format(dali_tensor.shape(), list(arr.size())))
  133. #turn raw int to a c void pointer
  134. c_type_pointer = ctypes.c_void_p(arr.data_ptr())
  135. dali_tensor.copy_to_external(c_type_pointer)
  136. return arr
class DALICOCOIterator(object):
    """
    COCO DALI iterator for pyTorch.

    Wraps one pipeline per GPU and, on each ``next()``, copies the DALI
    outputs into freshly allocated torch tensors (double-buffered per GPU).
    Each element of the returned list is the tuple
    ``(images, bboxes, labels, offsets)`` for one GPU.

    Parameters
    ----------
    pipelines : list of nvidia.dali.pipeline.Pipeline
        List of pipelines to use
    size : int
        Epoch size.
    """
    def __init__(self, pipelines, size):
        if not isinstance(pipelines, list):
            pipelines = [pipelines]
        self._num_gpus = len(pipelines)
        assert pipelines is not None, "Number of provided pipelines has to be at least 1"
        self.batch_size = pipelines[0].max_batch_size
        self._size = size
        self._pipes = pipelines

        # Build all pipelines
        for p in self._pipes:
            p.build()

        # Use double-buffering of data batches: each per-GPU slot holds two
        # batches and __next__ alternates between them.
        self._data_batches = [[None, None, None, None] for i in range(self._num_gpus)]
        self._counter = 0
        self._current_data_batch = 0
        # Order of the three pipeline outputs, matched by position below.
        self.output_map = ["image", "bboxes", "labels"]

        # We need data about the batches (like shape information),
        # so we need to run a single batch as part of setup to get that info
        self._first_batch = None
        self._first_batch = self.next()

    def __next__(self):
        # Serve the batch pre-fetched during __init__ first.
        if self._first_batch is not None:
            batch = self._first_batch
            self._first_batch = None
            return batch
        if self._counter > self._size:
            raise StopIteration

        # Gather outputs
        # NOTE(review): _prefetch is a private Pipeline API; share_outputs
        # exposes the current outputs without copying them yet.
        outputs = []
        for p in self._pipes:
            p._prefetch()
        for p in self._pipes:
            outputs.append(p.share_outputs())
        for i in range(self._num_gpus):
            dev_id = self._pipes[i].device_id
            out_images = []
            bboxes = []
            labels = []
            # segregate outputs into image/labels/bboxes entries
            for j, out in enumerate(outputs[i]):
                if self.output_map[j] == "image":
                    out_images.append(out)
                elif self.output_map[j] == "bboxes":
                    bboxes.append(out)
                elif self.output_map[j] == "labels":
                    labels.append(out)

            # Change DALI TensorLists into Tensors
            images = [x.as_tensor() for x in out_images]
            images_shape = [x.shape() for x in images]

            # Prepare bboxes shapes (per output, per sample — boxes are
            # variable-length per image)
            bboxes_shape = []
            for j in range(len(bboxes)):
                bboxes_shape.append([])
                for k in range(len(bboxes[j])):
                    bboxes_shape[j].append(bboxes[j][k].shape())

            # Prepare labels shapes and offsets: bbox_offsets[j] is a prefix
            # sum so offsets delimit each sample's boxes after concatenation.
            labels_shape = []
            bbox_offsets = []

            torch.cuda.synchronize()
            for j in range(len(labels)):
                labels_shape.append([])
                bbox_offsets.append([0])
                for k in range(len(labels[j])):
                    lshape = labels[j][k].shape()
                    bbox_offsets[j].append(bbox_offsets[j][k] + lshape[0])
                    labels_shape[j].append(lshape)

            # We always need to alocate new memory as bboxes and labels varies in shape
            images_torch_type = to_torch_type[to_numpy_type(images[0].dtype)]
            bboxes_torch_type = to_torch_type[to_numpy_type(bboxes[0][0].dtype)]
            labels_torch_type = to_torch_type[to_numpy_type(labels[0][0].dtype)]

            torch_gpu_device = torch.device('cuda', dev_id)
            torch_cpu_device = torch.device('cpu')

            pyt_images = [torch.zeros(shape, dtype=images_torch_type, device=torch_gpu_device) for shape in images_shape]
            pyt_bboxes = [[torch.zeros(shape, dtype=bboxes_torch_type, device=torch_gpu_device) for shape in shape_list] for shape_list in bboxes_shape]
            pyt_labels = [[torch.zeros(shape, dtype=labels_torch_type, device=torch_gpu_device) for shape in shape_list] for shape_list in labels_shape]
            pyt_offsets = [torch.zeros(len(offset), dtype=torch.int32, device=torch_cpu_device) for offset in bbox_offsets]

            self._data_batches[i][self._current_data_batch] = (pyt_images, pyt_bboxes, pyt_labels, pyt_offsets)

            # Copy data from DALI Tensors to torch tensors
            for j, i_arr in enumerate(images):
                feed_ndarray(i_arr, pyt_images[j])

            # Per-sample bbox tensors are copied then concatenated into one
            # tensor per output; empty samples are skipped (nothing to copy).
            for j, b_list in enumerate(bboxes):
                for k in range(len(b_list)):
                    if (pyt_bboxes[j][k].shape[0] != 0):
                        feed_ndarray(b_list[k], pyt_bboxes[j][k])
                pyt_bboxes[j] = torch.cat(pyt_bboxes[j])

            for j, l_list in enumerate(labels):
                for k in range(len(l_list)):
                    if (pyt_labels[j][k].shape[0] != 0):
                        feed_ndarray(l_list[k], pyt_labels[j][k])
                pyt_labels[j] = torch.cat(pyt_labels[j])

            # Offsets stay on CPU as int tensors (replacing the placeholder
            # zeros allocated above).
            for j in range(len(pyt_offsets)):
                pyt_offsets[j] = torch.IntTensor(bbox_offsets[j])

        # Release DALI buffers and kick off the next iteration's work.
        for p in self._pipes:
            p.release_outputs()
            p.schedule_run()

        copy_db_index = self._current_data_batch
        # Change index for double buffering
        self._current_data_batch = (self._current_data_batch + 1) % 2
        self._counter += self._num_gpus * self.batch_size
        return [db[copy_db_index] for db in self._data_batches]

    def next(self):
        """
        Returns the next batch of data.
        """
        return self.__next__();

    def __iter__(self):
        return self

    def reset(self):
        """
        Resets the iterator after the full epoch.
        DALI iterators do not support resetting before the end of the epoch
        and will ignore such request.
        """
        if self._counter > self._size:
            self._counter = self._counter % self._size
        else:
            logging.warning("DALI iterator does not support resetting while epoch is not finished. Ignoring...")