convert_librispeech.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. #!/usr/bin/env python
  15. import argparse
  16. import os
  17. import glob
  18. import multiprocessing
  19. import json
  20. import pandas as pd
  21. from preprocessing_utils import parallel_preprocess
  22. parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.')
  23. parser.add_argument('--input_dir', type=str, required=True,
  24. help='LibriSpeech collection input dir')
  25. parser.add_argument('--dest_dir', type=str, required=True,
  26. help='Output dir')
  27. parser.add_argument('--output_json', type=str, default='./',
  28. help='name of the output json file.')
  29. parser.add_argument('-s','--speed', type=float, nargs='*',
  30. help='Speed perturbation ratio')
  31. parser.add_argument('--target_sr', type=int, default=None,
  32. help='Target sample rate. '
  33. 'defaults to the input sample rate')
  34. parser.add_argument('--overwrite', action='store_true',
  35. help='Overwrite file if exists')
  36. parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(),
  37. help='Number of threads to use when processing audio files')
  38. args = parser.parse_args()
  39. args.input_dir = args.input_dir.rstrip('/')
  40. args.dest_dir = args.dest_dir.rstrip('/')
  41. def build_input_arr(input_dir):
  42. txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'),
  43. recursive=True)
  44. input_data = []
  45. for txt_file in txt_files:
  46. rel_path = os.path.relpath(txt_file, input_dir)
  47. with open(txt_file) as fp:
  48. for line in fp:
  49. fname, _, transcript = line.partition(' ')
  50. input_data.append(dict(input_relpath=os.path.dirname(rel_path),
  51. input_fname=fname+'.flac',
  52. transcript=transcript))
  53. return input_data
  54. print("[%s] Scaning input dir..." % args.output_json)
  55. dataset = build_input_arr(input_dir=args.input_dir)
  56. print("[%s] Converting audio files..." % args.output_json)
  57. dataset = parallel_preprocess(dataset=dataset,
  58. input_dir=args.input_dir,
  59. dest_dir=args.dest_dir,
  60. target_sr=args.target_sr,
  61. speed=args.speed,
  62. overwrite=args.overwrite,
  63. parallel=args.parallel)
  64. print("[%s] Generating json..." % args.output_json)
  65. df = pd.DataFrame(dataset, dtype=object)
  66. # Save json with python. df.to_json() produces back slashed in file paths
  67. dataset = df.to_dict(orient='records')
  68. with open(args.output_json, 'w') as fp:
  69. json.dump(dataset, fp, indent=2)