SunnyMirror
/
DeepLearningExamples
mirror de https://github.com/NVIDIA/DeepLearningExamples.git


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
							#!/bin/bash

# Report the container build this run executes in. The variable is expanded
# unquoted on purpose so nothing extra is printed when it is unset.
echo "Container nvidia build = " ${NVIDIA_BUILD_ID}

# Positional arguments. Each one falls back to the default shown when the
# argument is missing or empty.
init_checkpoint="${1:-/results/biobert_tf_uncased_base/model.ckpt-4340}"  # $1: checkpoint passed as --init_checkpoint
train_batch_size="${2:-8}"        # $2: passed as --train_batch_size; multiplied by num_gpu for GBS
learning_rate="${3:-3.125e-6}"    # $3: passed as --learning_rate
cased="${4:-false}"               # $4: exactly "true" selects the cased settings
precision="${5:-fp16}"            # $5: exactly "fp16" selects --amp, anything else --noamp
use_xla="${6:-true}"              # $6: exactly "true" selects --use_xla
num_gpu="${7:-16}"                # $7: more than 1 launches through mpirun
seq_length="${8:-128}"            # $8: passed as --max_seq_length
bert_model="${9:-base}"           # $9: exactly "large" selects the L-24 weights directory
eval_batch_size="${10:-8}"        # $10: used for both --eval_batch_size and --predict_batch_size
epochs="${11:-100.0}"             # $11: passed as --num_train_epochs

# Casing-dependent settings. Only the exact value "true" selects the cased
# variant; every other value (including unset) selects the uncased one.
case "$cased" in
    true)
        DO_LOWER_CASE=0
        CASING_DIR_PREFIX="cased"
        case_flag="--do_lower_case=False"
        ;;
    *)
        DO_LOWER_CASE=1
        CASING_DIR_PREFIX="uncased"
        case_flag="--do_lower_case=True"
        ;;
esac

# Pretrained-weights directory: the L-24_H-1024_A-16 directory for "large",
# the L-12_H-768_A-12 directory for anything else. Exported for child processes.
case "$bert_model" in
    large)
        BERT_DIR="/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16"
        ;;
    *)
        BERT_DIR="/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12"
        ;;
esac
export BERT_DIR

# Derive the run identifiers and create the output directory.
#
# Reads:  train_batch_size, num_gpu, bert_model, precision.
# Writes: GBS (exported), TAG, DATESTAMP, DATASET_DIR, OUTPUT_DIR.

# Global batch size = train_batch_size * num_gpu. Shell arithmetic replaces
# the external `expr` call; assigning first and exporting afterwards keeps a
# failed computation from being hidden behind the exit status of `export`.
GBS=$(( train_batch_size * num_gpu ))
export GBS

# Run tag built from model size, precision and global batch size.
printf -v TAG "tf_bert_biobert_ner_bc5cdr_disease_%s_%s_gbs%d" "$bert_model" "$precision" "$GBS"

# Timestamp (yymmddHHMMSS) that makes the output directory name unique per run.
DATESTAMP=$(date +'%y%m%d%H%M%S')

DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
OUTPUT_DIR="/results/${TAG}_${DATESTAMP}"

# Quoted because TAG embeds user-supplied arguments: the path must stay one
# word even if those contain whitespace or glob characters.
mkdir -p -- "${OUTPUT_DIR}"

# Mixed-precision flag: exactly "fp16" turns AMP on, any other value turns it off.
case "$precision" in
    fp16)
        echo "fp16 activated!"
        use_fp16="--amp"
        ;;
    *)
        echo "fp32/tf32 activated!"
        use_fp16="--noamp"
        ;;
esac

# XLA flag: exactly "true" enables it, any other value passes the negated flag.
case "$use_xla" in
    true)
        use_xla_tag="--use_xla"
        echo "XLA activated"
        ;;
    *)
        use_xla_tag="--nouse_xla"
        ;;
esac

# Assemble and launch run_ner.py (prepare, train, eval and predict in one go).
#
# Reads:  num_gpu, BERT_DIR, init_checkpoint, epochs, DATASET_DIR, OUTPUT_DIR,
#         learning_rate, train_batch_size, eval_batch_size, seq_length,
#         use_fp16, use_xla_tag, case_flag.
# Writes: mpi_command (array), use_hvd, launch_cmd (array).
# Exit status of the script is that of the launched command.
#
# The command line is built as an array so every value reaches the program as
# exactly one argument. Optional flags that are empty are left out; previously
# a quoted empty "$use_hvd" was passed as a stray "" argument on the
# single-GPU path.

# More than one GPU: prefix the command with mpirun (one process per GPU on
# localhost) and pass --horovod. Otherwise run python3 directly.
mpi_command=()
use_hvd=""
if [ "${num_gpu:-0}" -gt 1 ] ; then
    mpi_command=(
        mpirun -np "$num_gpu" -H "localhost:$num_gpu"
        --allow-run-as-root -bind-to none -map-by slot
        -x NCCL_DEBUG=INFO
        -x LD_LIBRARY_PATH
        -x PATH -mca pml ob1 -mca btl '^openib'
    )
    use_hvd="--horovod"
fi

launch_cmd=(
    "${mpi_command[@]}"
    python3 /workspace/bert/run_ner.py
    --do_prepare=true
    --do_train=true
    --do_eval=true
    --do_predict=true
    --task_name="bc5cdr"
    --vocab_file="$BERT_DIR/vocab.txt"
    --bert_config_file="$BERT_DIR/bert_config.json"
    --init_checkpoint="$init_checkpoint"
    --num_train_epochs="$epochs"
    --data_dir="$DATASET_DIR"
    --output_dir="$OUTPUT_DIR"
    --learning_rate="$learning_rate"
    --train_batch_size="$train_batch_size"
    --eval_batch_size="$eval_batch_size"
    --predict_batch_size="$eval_batch_size"
    --max_seq_length="$seq_length"
)

# Append the optional switches, skipping any that are empty.
for optional_flag in "$use_hvd" "$use_fp16" "$use_xla_tag" "$case_flag"; do
    if [ -n "$optional_flag" ]; then
        launch_cmd+=("$optional_flag")
    fi
done

"${launch_cmd[@]}"