prepare_dataset.sh (2.6 KB)
  1. #! /bin/bash
  2. # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # Examples:
  16. # to run on a DGX2 with a frequency limit of 3 (will need 8xV100-32GB to fit the model in GPU memory)
  17. # ./prepare_dataset.sh DGX2 3
  18. #
  19. # to run on a DGX2 with a frequency limit of 15 (should fit on a single V100-32GB):
  20. # ./prepare_dataset.sh DGX2 15
  21. #
  22. # to run on CPU with a frequency limit of 15:
  23. # ./prepare_dataset.sh CPU 15
  24. set -e
  25. set -x
  26. ls -ltrash
  27. download_dir=${download_dir:-'/data/criteo_orig'}
  28. ./verify_criteo_downloaded.sh ${download_dir}
  29. spark_output_path=${spark_output_path:-'/data/spark/output'}
  30. if [ -f ${spark_output_path}/train/_SUCCESS ] \
  31. && [ -f ${spark_output_path}/validation/_SUCCESS ] \
  32. && [ -f ${spark_output_path}/test/_SUCCESS ]; then
  33. echo "Spark preprocessing already carried out"
  34. else
  35. echo "Performing spark preprocessing"
  36. ./run_spark.sh $1 ${download_dir} ${spark_output_path} $2
  37. fi
  38. conversion_intermediate_dir=${conversion_intermediate_dir:-'/data/intermediate_binary'}
  39. final_output_dir=${final_output_dir:-'/data/preprocessed'}
  40. if [ -d ${final_output_dir}/train ] \
  41. && [ -d ${final_output_dir}/validation ] \
  42. && [ -d ${final_output_dir}/test ] \
  43. && [ -f ${final_output_dir}/feature_spec.yaml ]; then
  44. echo "Final conversion already done"
  45. else
  46. echo "Performing final conversion to a custom data format"
  47. python parquet_to_binary.py --parallel_jobs 40 --src_dir ${spark_output_path} \
  48. --intermediate_dir ${conversion_intermediate_dir} \
  49. --dst_dir ${final_output_dir}
  50. cp "${spark_output_path}/model_size.json" "${final_output_dir}/model_size.json"
  51. python split_dataset.py --dataset "${final_output_dir}" --output "${final_output_dir}/split"
  52. rm ${final_output_dir}/train_data.bin
  53. rm ${final_output_dir}/validation_data.bin
  54. rm ${final_output_dir}/test_data.bin
  55. rm ${final_output_dir}/model_size.json
  56. mv ${final_output_dir}/split/* ${final_output_dir}
  57. rm -rf ${final_output_dir}/split
  58. fi
  59. echo "Done preprocessing the Criteo Kaggle Dataset"