prepare_dataset.sh 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. # Copyright (c) 2018, deepakn94, robieta. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #
  15. # -----------------------------------------------------------------------
  16. #
  17. # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  18. #
  19. # Licensed under the Apache License, Version 2.0 (the "License");
  20. # you may not use this file except in compliance with the License.
  21. # You may obtain a copy of the License at
  22. #
  23. # http://www.apache.org/licenses/LICENSE-2.0
  24. #
  25. # Unless required by applicable law or agreed to in writing, software
  26. # distributed under the License is distributed on an "AS IS" BASIS,
  27. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  28. # See the License for the specific language governing permissions and
  29. # limitations under the License.
  #!/bin/bash
  # NOTE: the interpreter line above sits below the license header instead of on
  # the first line of the file, so it only acts as a comment. The code below
  # therefore sticks to POSIX sh constructs and behaves the same whether it is
  # started with `bash prepare_dataset.sh` or picked up by /bin/sh.
  #
  # Prepares a MovieLens ratings dataset for training.
  #
  # Usage: prepare_dataset.sh [DATASET_NAME] [RAW_DATADIR] [CACHED_DATADIR]
  #   DATASET_NAME    dataset identifier (default: ml-20m)
  #   RAW_DATADIR     directory with the raw archive / ratings file
  #                   (default: /data/<DATASET_NAME>)
  #   CACHED_DATADIR  directory that receives the preprocessed output
  #                   (default: /data/cache/<DATASET_NAME>)
  #
  # Steps:
  #   1. Resolve the archive and ratings-file locations for the dataset.
  #   2. Create the raw and cache directories when they are missing.
  #   3. Extract the archive if the ratings file is not there yet.
  #   4. Run convert.py unless the cache already holds feature_spec.yaml.
  #
  # Exits with status 1 when the required archive or ratings file is missing.
  set -e
  set -u
  set -x

  DATASET_NAME=${1:-'ml-20m'}
  RAW_DATADIR=${2:-"/data/${DATASET_NAME}"}
  CACHED_DATADIR=${3:-"/data/cache/${DATASET_NAME}"}

  # Only the zipped datasets assign these; initialised so `set -u` never trips.
  ZIP_PATH=''
  DOWNLOAD_URL=''

  # you can add another option to this case in order to support other datasets
  case "${DATASET_NAME}" in
    'ml-20m')
      ZIP_PATH="${RAW_DATADIR}/ml-20m.zip"
      SHOULD_UNZIP=1
      RATINGS_PATH="${RAW_DATADIR}/ml-20m/ratings.csv"
      DOWNLOAD_URL='https://grouplens.org/datasets/movielens/20m/'
      ;;
    'ml-1m')
      ZIP_PATH="${RAW_DATADIR}/ml-1m.zip"
      SHOULD_UNZIP=1
      RATINGS_PATH="${RAW_DATADIR}/ml-1m/ratings.dat"
      DOWNLOAD_URL='https://grouplens.org/datasets/movielens/1m/'
      ;;
    *)
      # Any other name: no archive handling, the ratings file must already exist.
      echo "Using unknown dataset: $DATASET_NAME."
      RATINGS_PATH="${RAW_DATADIR}/ratings.csv"
      echo "Expecting file at ${RATINGS_PATH}"
      SHOULD_UNZIP=0
      ;;
  esac

  # `mkdir -p` succeeds on an existing directory, so no separate -d test is needed.
  mkdir -p -- "${RAW_DATADIR}"
  mkdir -p -- "${CACHED_DATADIR}"

  # Drop a regular file named `log` in the current directory (presumably left
  # over from a previous run). The -f test keeps a directory of that name intact.
  if [ -f log ]; then
    rm -f log
  fi

  if [ ! -f "${RATINGS_PATH}" ]; then
    if [ "${SHOULD_UNZIP}" = 1 ]; then
      if [ ! -f "${ZIP_PATH}" ]; then
        # Point at the download page of the dataset that was actually requested.
        echo "Dataset not found. Please download it from: ${DOWNLOAD_URL} and put it in ${ZIP_PATH}" >&2
        exit 1
      fi
      # -u only extracts entries that are missing or older than the archive copy.
      unzip -u "${ZIP_PATH}" -d "${RAW_DATADIR}"
    else
      echo "File not found at ${RATINGS_PATH}. Aborting." >&2
      exit 1
    fi
  fi

  # feature_spec.yaml in the cache directory is the marker for "already
  # preprocessed" (presumably written by convert.py; verify against that script).
  if [ ! -f "${CACHED_DATADIR}/feature_spec.yaml" ]; then
    echo "preprocessing ${RATINGS_PATH} and save to disk"
    t0=$(date +%s)
    python convert.py --path "${RATINGS_PATH}" --output "${CACHED_DATADIR}"
    t1=$(date +%s)
    delta=$(( t1 - t0 ))
    echo "Finish preprocessing in $delta seconds"
  else
    echo 'Using cached preprocessed data'
  fi

  echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR"
  echo "You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> --use_env ncf.py --data ${CACHED_DATADIR}"