#!/usr/bin/env bash
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. set -eu
  16. : ${DATASET_DIR:=/datasets/LibriSpeech}
  17. : ${FILELISTS_DIR:=$DATASET_DIR}
  18. : ${EXT:=flac} # or wav
  19. mkdir -p $DATASET_DIR
  20. mkdir -p $FILELISTS_DIR
  21. for SUBSET in train-clean-100 train-clean-360 train-other-500 \
  22. dev-clean dev-other test-clean test-other \
  23. ; do
  24. TSV=$FILELISTS_DIR/$SUBSET.tsv
  25. if [ ! -d $DATASET_DIR/$SUBSET ]; then
  26. echo "ERROR: $DATASET_DIR/$SUBSET does not exist; skipping."
  27. continue
  28. fi
  29. python3 utils/generate_filelist.py --extension $EXT $DATASET_DIR/$SUBSET $TSV
  30. python3 utils/libri_labels.py $TSV --output-dir $FILELISTS_DIR --output-name $SUBSET
  31. done
  32. # Combine
  33. python3 utils/combine_filelists.py $FILELISTS_DIR/train-{clean-100,clean-360,other-500}.tsv > $FILELISTS_DIR/train-full-960.tsv
  34. cat $FILELISTS_DIR/train-clean-100.wrd > $FILELISTS_DIR/train-full-960.wrd
  35. cat $FILELISTS_DIR/train-clean-360.wrd >> $FILELISTS_DIR/train-full-960.wrd
  36. cat $FILELISTS_DIR/train-other-500.wrd >> $FILELISTS_DIR/train-full-960.wrd
  37. cat $FILELISTS_DIR/train-clean-100.ltr > $FILELISTS_DIR/train-full-960.ltr
  38. cat $FILELISTS_DIR/train-clean-360.ltr >> $FILELISTS_DIR/train-full-960.ltr
  39. cat $FILELISTS_DIR/train-other-500.ltr >> $FILELISTS_DIR/train-full-960.ltr
  40. python3 utils/generate_dictionary.py $FILELISTS_DIR/train-full-960.ltr $FILELISTS_DIR/dict.ltr.txt