@@ -23,13 +23,17 @@ def main(args):
     if args.input_files:
         args.input_files = args.input_files.split(',')

+    hdf5_tfrecord_folder_prefix = "_lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+        + "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+        + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor)
+
     directory_structure = {
         'download' : working_dir + '/download',    # Downloaded and decompressed
         'extracted' : working_dir + '/extracted',    # Extracted from whatever the initial format is (e.g., wikiextractor)
         'formatted' : working_dir + '/formatted_one_article_per_line',    # This is the level where all sources should look the same
-        'sharded' : working_dir + '/sharded',
-        'tfrecord' : working_dir + '/tfrecord',
-        'hdf5': working_dir + '/hdf5'
+        'sharded' : working_dir + '/sharded_training_shards_' + str(args.n_training_shards) + '_test_shards_' + str(args.n_test_shards) + '_fraction_' + str(args.fraction_test_set),
+        'tfrecord' : working_dir + '/tfrecord' + hdf5_tfrecord_folder_prefix,
+        'hdf5': working_dir + '/hdf5' + hdf5_tfrecord_folder_prefix
     }

     print('\nDirectory Structure:')
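
The added hdf5_tfrecord_folder_prefix (a suffix, despite the name) folds every content-affecting hyperparameter into the tfrecord and hdf5 directory names, so runs with different settings write to different directories instead of silently overwriting each other. A minimal sketch of the naming scheme, with hypothetical argument values:

    # Sketch only: mirrors the suffix built above; all values are hypothetical.
    def output_suffix(do_lower_case=True, max_seq_length=128, max_predictions_per_seq=20,
                      masked_lm_prob=0.15, random_seed=12345, dupe_factor=5):
        return ('_lower_case_' + str(do_lower_case) + '_seq_len_' + str(max_seq_length)
                + '_max_pred_' + str(max_predictions_per_seq) + '_masked_lm_prob_' + str(masked_lm_prob)
                + '_random_seed_' + str(random_seed) + '_dupe_factor_' + str(dupe_factor))

    # -> /workspace/data/hdf5_lower_case_True_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5
    print('/workspace/data/hdf5' + output_suffix())
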
@@ -100,8 +104,7 @@ def main(args):
         elif args.dataset == 'books_wiki_en_corpus':
             args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']

-        if args.output_file_prefix is None:
-            args.output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
+        output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset

         if not os.path.exists(directory_structure['sharded']):
             os.makedirs(directory_structure['sharded'])
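
The exists()/makedirs() pair above is the pre-3.2 idiom; os.makedirs has accepted exist_ok=True since Python 3.2, which does the check and creation in one call and avoids a race when several processes bootstrap the same tree. A one-line equivalent, using a stand-in path:

    import os

    # 'sharded_dir' stands in for directory_structure['sharded'].
    sharded_dir = '/workspace/data/sharded_training_shards_256_test_shards_128_fraction_0.1'
    os.makedirs(sharded_dir, exist_ok=True)
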
@@ -114,7 +117,7 @@ def main(args):
         # Different languages (e.g., Chinese simplified/traditional) may require translation and
         # other packages to be called from here -- just add a conditional branch for those extra steps
         segmenter = TextSharding.NLTKSegmenter()
-        sharding = TextSharding.Sharding(args.input_files, args.output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
+        sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)

         sharding.load_articles()
         sharding.segment_articles_into_sentences(segmenter)
@@ -127,15 +130,15 @@ def main(args):
     elif args.action == 'create_tfrecord_files':
         assert False, 'TFrecord creation not supported in this PyTorch model example release.' \
             ''
-        if not os.path.exists(directory_structure['tfrecord']):
-            os.makedirs(directory_structure['tfrecord'])
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)

         def create_record_worker(filename_prefix, shard_id, output_format='tfrecord'):
             bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
             bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
             bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
             bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
-            bert_preprocessing_command += ' --do_lower_case=' + 'true' if args.do_lower_case else 'false'
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
             bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
             bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
             bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
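
The replaced --do_lower_case line fixes an operator-precedence bug: '+' binds tighter than a conditional expression, so the old right-hand side parsed as (' --do_lower_case=' + 'true') if args.do_lower_case else 'false', and with lower-casing disabled the bare string 'false' was appended with no flag name at all. A quick demonstration:

    # '+' binds tighter than 'if/else':
    do_lower_case = False
    old = ' --do_lower_case=' + 'true' if do_lower_case else 'false'
    print(repr(old))  # 'false' -- the flag name is lost
    new = ' --do_lower_case' if do_lower_case else ''
    print(repr(new))  # ''      -- the new form simply omits the flag
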
@@ -149,14 +152,17 @@ def main(args):
             # This could be better optimized (fine if all take equal time)
             if shard_id % args.n_processes == 0 and shard_id > 0:
                 bert_preprocessing_process.wait()
+            return last_process
+
+        output_file_prefix = args.dataset

         for i in range(args.n_training_shards):
-            create_record_worker(args.output_file_prefix + '_training', i)
+            last_process = create_record_worker(output_file_prefix + '_training', i)

         last_process.wait()

         for i in range(args.n_test_shards):
-            create_record_worker(args.output_file_prefix + '_test', i)
+            last_process = create_record_worker(output_file_prefix + '_test', i)

         last_process.wait()
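
create_record_worker now returns the Popen handle it launched; each loop keeps the most recent handle and waits on it once per split. Combined with the in-worker wait() on every n_processes-th shard, this caps the number of concurrent create_pretraining_data.py children, under the script's own assumption that shards take roughly equal time. A self-contained sketch of the pattern (the echo command and throttle width are placeholders):

    import subprocess

    n_processes = 4  # hypothetical throttle width

    def launch(shard_id):
        proc = subprocess.Popen('echo shard %d' % shard_id, shell=True)
        # Every n_processes-th launch blocks on its own child, which keeps
        # roughly n_processes children alive at once if runtimes are similar.
        if shard_id % n_processes == 0 and shard_id > 0:
            proc.wait()
        return proc

    last_process = None
    for i in range(10):
        last_process = launch(i)
    last_process.wait()  # only guarantees the *final* shard has exited

As the last comment hints, waiting on last_process alone does not prove the earlier shards finished; that looseness is inherited from the original design rather than introduced by this change.
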
@@ -164,17 +170,20 @@ def main(args):
     elif args.action == 'create_hdf5_files':
         last_process = None

+        if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
+            os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
+
         def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
             bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
             bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
-            bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
             bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
             bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
-            bert_preprocessing_command += ' --max_seq_length=' + args.max_seq_length
-            bert_preprocessing_command += ' --max_predictions_per_seq=' + args.max_predictions_per_seq
-            bert_preprocessing_command += ' --masked_lm_prob=' + args.masked_lm_prob
-            bert_preprocessing_command += ' --random_seed=' + args.random_seed
-            bert_preprocessing_command += ' --dupe_factor=' + args.dupe_factor
+            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
+            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
+            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
+            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
             bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
             bert_preprocessing_process.communicate()
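
One thing to note in the context above: Popen.communicate() blocks until the child exits, so as written each shard is processed to completion before the next is launched, and the periodic wait() throttle below it never has pending work to wait for. If concurrent workers are the intent, this call is the one to revisit:

    import subprocess

    p = subprocess.Popen('sleep 1', shell=True)
    p.communicate()  # blocks until the child exits -- serializes the loop
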
@@ -183,14 +192,17 @@ def main(args):
             # This could be better optimized (fine if all take equal time)
             if shard_id % args.n_processes == 0 and shard_id > 0:
                 bert_preprocessing_process.wait()
+            return last_process
+
+        output_file_prefix = args.dataset

         for i in range(args.n_training_shards):
-            create_record_worker(args.output_file_prefix + '_training', i)
+            last_process = create_record_worker(output_file_prefix + '_training', i)

         last_process.wait()

         for i in range(args.n_test_shards):
-            create_record_worker(args.output_file_prefix + '_test', i)
+            last_process = create_record_worker(output_file_prefix + '_test', i)

         last_process.wait()
@@ -236,12 +248,6 @@ if __name__ == "__main__":
         help='Specify the input files in a comma-separated list (no spaces)'
     )

-    parser.add_argument(
-        '--output_file_prefix',
-        type=str,
-        help='Specify the naming convention (prefix) of the output files'
-    )
-
     parser.add_argument(
         '--n_training_shards',
         type=int,
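
With the --output_file_prefix flag removed, every output location is derived from working_dir, the dataset name, and the hyperparameter-bearing directory names, so the CLI can no longer point two differently configured runs at the same files. A sketch of the resulting layout, with hypothetical values:

    # All values hypothetical; shows where outputs land after this change.
    working_dir = '/workspace/data'
    dataset = 'wikicorpus_en'
    sharded_dir = working_dir + '/sharded_training_shards_256_test_shards_128_fraction_0.1'
    hdf5_dir = working_dir + '/hdf5_lower_case_True_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5'
    print(sharded_dir + '/' + dataset + '/' + dataset + '_training_0.txt')  # shard 0, training split
    print(hdf5_dir + '/' + dataset + '/' + dataset + '_training_0.hdf5')    # matching HDF5 output
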