prepare_dataset.sh (2.6 KB)
  1. #! /bin/bash
  2. # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # Examples:
  16. # to run on a DGX2 with a frequency limit of 3 (will need 8xV100-32GB to fit the model in GPU memory)
  17. # ./prepare_dataset.sh DGX2 3
  18. #
  19. # to run on a DGX2 with a frequency limit of 15 (should fit on a single V100-32GB):
  20. # ./prepare_dataset.sh DGX2 15
  21. #
  22. # to run on CPU with a frequency limit of 15:
  23. # ./prepare_dataset.sh CPU 15
  24. set -e
  25. set -x
  26. ls -ltrash
  27. download_dir=${download_dir:-'/data/criteo_orig'}
  28. ./verify_criteo_downloaded.sh ${download_dir}
  29. spark_output_path=${spark_output_path:-'/data/spark/output'}
  30. if [ -f ${spark_output_path}/train/_SUCCESS ] \
  31. && [ -f ${spark_output_path}/validation/_SUCCESS ] \
  32. && [ -f ${spark_output_path}/test/_SUCCESS ]; then
  33. echo "Spark preprocessing already carried out"
  34. else
  35. echo "Performing spark preprocessing"
  36. ./run_spark.sh $1 ${download_dir} ${spark_output_path} $2
  37. fi
  38. conversion_intermediate_dir=${conversion_intermediate_dir:-'/data/intermediate_binary'}
  39. final_output_dir=${final_output_dir:-'/data/preprocessed'}
  40. if [ -d ${final_output_dir}/train ] \
  41. && [ -d ${final_output_dir}/validation ] \
  42. && [ -d ${final_output_dir}/test ] \
  43. && [ -f ${final_output_dir}/feature_spec.yaml ]; then
  44. echo "Final conversion already done"
  45. else
  46. echo "Performing final conversion to a custom data format"
  47. python parquet_to_binary.py --parallel_jobs 40 --src_dir ${spark_output_path} \
  48. --intermediate_dir ${conversion_intermediate_dir} \
  49. --dst_dir ${final_output_dir}
  50. cp "${spark_output_path}/model_size.json" "${final_output_dir}/model_size.json"
  51. python split_dataset.py --dataset "${final_output_dir}" --output "${final_output_dir}/split"
  52. rm ${final_output_dir}/train_data.bin
  53. rm ${final_output_dir}/validation_data.bin
  54. rm ${final_output_dir}/test_data.bin
  55. rm ${final_output_dir}/model_size.json
  56. mv ${final_output_dir}/split/* ${final_output_dir}
  57. rm -rf ${final_output_dir}/split
  58. fi
  59. echo "Done preprocessing the Criteo Kaggle Dataset"