# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
import glob

import numpy as np
import dllogger

from paddle.fluid import LoDTensor
from paddle.inference import Config, PrecisionType, create_predictor

from dali import dali_dataloader, dali_synthetic_dataloader
from utils.config import parse_args, print_args
from utils.mode import Mode
from utils.logger import setup_dllogger


def init_predictor(args):
    """
    Assemble a Paddle inference config with the TensorRT engine enabled and
    create a predictor from it.

    Args:
        args: Parsed options. This function reads inference_dir, device,
            precision, batch_size, workspace_size, min_subgraph_size,
            use_static, use_calib_mode and image_shape.
    Returns:
        The object returned by ``create_predictor`` for the assembled config.
    Raises:
        AssertionError: inference_dir is not a directory, it does not contain
            exactly one ``*.pdiparams`` and exactly one ``*.pdmodel`` file, or
            precision is not one of FP32/FP16/INT8.
    """
    model_dir = args.inference_dir
    assert os.path.isdir(model_dir), \
        f'inference_dir = "{model_dir}" is not a directory'

    # Exactly one parameter file and one model file must be present.
    params_files = glob.glob(os.path.join(model_dir, '*.pdiparams'))
    model_files = glob.glob(os.path.join(model_dir, '*.pdmodel'))
    assert len(params_files) == 1, \
        f'There should be only 1 pdiparams in {model_dir}, but there are {len(params_files)}'
    assert len(model_files) == 1, \
        f'There should be only 1 pdmodel in {model_dir}, but there are {len(model_files)}'

    config = Config(model_files[0], params_files[0])
    config.enable_memory_optim()
    # NOTE(review): the leading 0 is presumably the initial GPU memory pool
    # size and args.device the GPU id - confirm against the Paddle docs.
    config.enable_use_gpu(0, args.device)

    requested_precision = args.precision
    trt_max_batch = args.batch_size
    assert requested_precision in ['FP32', 'FP16', 'INT8'], \
        'precision should be FP32/FP16/INT8'

    known_precisions = (
        ('INT8', PrecisionType.Int8),
        ('FP16', PrecisionType.Half),
        ('FP32', PrecisionType.Float32),
    )
    for precision_name, candidate_mode in known_precisions:
        if requested_precision == precision_name:
            trt_precision = candidate_mode
            break
    else:
        # Only reachable when the assertion above has been disabled.
        raise NotImplementedError

    trt_options = dict(
        workspace_size=args.workspace_size,
        max_batch_size=trt_max_batch,
        min_subgraph_size=args.min_subgraph_size,
        precision_mode=trt_precision,
        use_static=args.use_static,
        use_calib_mode=args.use_calib_mode)
    config.enable_tensorrt_engine(**trt_options)

    # Three shape maps for the "data" input: batch size 1 in the first one,
    # args.batch_size in the other two.
    sample_shape = tuple(args.image_shape)
    single_sample_shape = (1,) + sample_shape
    full_batch_shape = (args.batch_size,) + sample_shape
    config.set_trt_dynamic_shape_info(
        {"data": single_sample_shape},
        {"data": full_batch_shape},
        {"data": full_batch_shape},
    )

    return create_predictor(config)


def predict(predictor, input_data):
    '''
    Bind the inputs to the predictor, run it once and gather every output.

    Args:
        predictor: Paddle inference predictor
        input_data: A list of inputs, indexed in the order of
            predictor.get_input_names(). A LoDTensor entry is shared with the
            input handle; any other entry is copied in from the host after
            the handle is reshaped to the entry's shape.
    Returns:
        A list with the host copy of each output, in the order of
        predictor.get_output_names()
    '''
    # Bind every named input.
    for index, input_name in enumerate(predictor.get_input_names()):
        handle = predictor.get_input_handle(input_name)
        feed = input_data[index]

        if not isinstance(feed, LoDTensor):
            handle.reshape(feed.shape)
            handle.copy_from_cpu(feed)
        else:
            handle.share_external_data(feed)

    # Run the inference.
    predictor.run()

    # Copy each output tensor back to the host.
    return [
        predictor.get_output_handle(output_name).copy_to_cpu()
        for output_name in predictor.get_output_names()
    ]


def benchmark_dataset(args):
    """
    Benchmark on a DALI format dataset, which reflects the real pipeline
    throughput including
    1. Read images
    2. Pre-processing
    3. Inference
    4. H2D, D2H

    Args:
        args: Parsed options. This function reads device, batch_size,
            image_shape, benchmark_warmup_steps and precision, and forwards
            args to init_predictor and dali_dataloader.
    Returns:
        A dict with precision, batch_size, throughput (images per second over
        the whole timed loop), accuracy (matching labels / total images) and
        the average, p90, p95 and p99 of the per-iteration latency in
        milliseconds.
    """
    predictor = init_predictor(args)

    device_name = 'gpu:' + str(args.device)
    loader = dali_dataloader(args, Mode.EVAL, device_name)

    # Run a few all-zero batches first so the timed numbers are stable.
    batch_size = args.batch_size
    image_shape = args.image_shape
    warmup_batch = np.zeros((batch_size, *image_shape)).astype(np.float32)
    for _ in range(args.benchmark_warmup_steps):
        predict(predictor, [warmup_batch])[0]

    images_seen = 0
    hits = 0
    step_seconds = []

    start = time.perf_counter()
    step_started = time.perf_counter()
    for loader_output in loader:
        for sample in loader_output:
            labels = np.asarray(sample['label'])
            images_seen += labels.shape[0]
            flat_labels = labels.flatten()
            predicted = predict(predictor, [sample['data']])[0]
            hits += (flat_labels == predicted).sum()
        # One latency sample per loader iteration; it spans everything since
        # the previous iteration finished.
        step_finished = time.perf_counter()
        step_seconds.append(step_finished - step_started)
        step_started = time.perf_counter()
    end = time.perf_counter()

    # perf_counter() reports seconds; convert to milliseconds.
    latency_ms = np.array(step_seconds) * 1000
    p90, p95, p99 = np.quantile(latency_ms, [0.9, 0.95, 0.99])

    return {
        'precision': args.precision,
        'batch_size': batch_size,
        'throughput': images_seen / (end - start),
        'accuracy': hits / images_seen,
        'eval_latency_avg': np.mean(latency_ms),
        'eval_latency_p90': p90,
        'eval_latency_p95': p95,
        'eval_latency_p99': p99,
    }


def benchmark_synthetic(args):
    """
    Benchmark on synthetic data and bypass all pre-processing.
    This is used to find the upper throughput bound when tuning the full
    input pipeline.

    NOTE(review): the original description states that the host to device
    copy is still included; whether that holds depends on what
    dali_synthetic_dataloader yields - confirm.

    Args:
        args: Parsed options. This function reads device, batch_size,
            image_shape, benchmark_warmup_steps and precision, and forwards
            args to init_predictor and dali_synthetic_dataloader.
    Returns:
        A dict with precision, batch_size, throughput (images per second over
        the whole timed loop, assuming every inference call processed
        batch_size images) and the average, p90, p95 and p99 of the
        per-iteration latency in milliseconds.
    """

    predictor = init_predictor(args)
    dali_iter = dali_synthetic_dataloader(args, 'gpu:' + str(args.device))

    batch_size = args.batch_size
    image_shape = args.image_shape
    images = np.random.random((batch_size, *image_shape)).astype(np.float32)

    latency = []
    # Number of inference calls made inside the timed loop. Throughput is
    # derived from this count rather than from args.benchmark_steps, so it
    # stays correct if the loader yields a different number of batches.
    num_batches = 0

    # warmup
    for _ in range(args.benchmark_warmup_steps):
        predict(predictor, [images])

    # benchmark
    start = time.perf_counter()
    last_time_step = time.perf_counter()
    for dali_data in dali_iter:
        for data in dali_data:
            images = data['data']
            predict(predictor, [images])
            num_batches += 1
        batch_end_time_step = time.perf_counter()
        batch_latency = batch_end_time_step - last_time_step
        latency.append(batch_latency)
        last_time_step = time.perf_counter()
    end = time.perf_counter()

    # perf_counter() reports seconds; convert to milliseconds.
    latency = np.array(latency) * 1000
    quantile = np.quantile(latency, [0.9, 0.95, 0.99])

    statistics = {
        'precision': args.precision,
        'batch_size': batch_size,
        'throughput': num_batches * batch_size / (end - start),
        'eval_latency_avg': np.mean(latency),
        'eval_latency_p90': quantile[0],
        'eval_latency_p95': quantile[1],
        'eval_latency_p99': quantile[2],
    }
    return statistics

def main(args):
    """
    Run the selected inference benchmark and log its statistics.

    Args:
        args: Parsed options. This function reads report_file, show_config
            and use_synthetic, and forwards args to the chosen benchmark.
    """
    setup_dllogger(args.report_file)
    if args.show_config:
        print_args(args)

    # Synthetic data when requested, otherwise the DALI dataset.
    run_benchmark = benchmark_synthetic if args.use_synthetic else benchmark_dataset
    results = run_benchmark(args)

    dllogger.log(step=tuple(), data=results)


if __name__ == '__main__':
    # Script entry point: parse_args is called with script='inference' and
    # its result is handed to main().
    cli_args = parse_args(script='inference')
    main(cli_args)