diff --git a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh index 7c6036a715270f99169e8cc384ba8ae12af26a60..a9c3708955c785f98159cfa872157538f6aee1a8 100644 --- a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh +++ b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -19,8 +19,8 @@ if [ -n "$devices" ]; then _devices=(${devices//,/ }) num_devices=${#_devices[@]} else - num_devices=8 - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" fi export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh index 1787fb081eb448fe832764edeb9fb0fc606123ba..1abf5d2c05bc628985e1e32ede54848e3829e366 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh @@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1 i=0 model="alexnet" -for arg in "$@"; do +for arg in "$@" +do if [ $i -eq 0 ]; then model=$arg let i++ @@ -40,11 +41,12 @@ for arg in "$@"; do done echo "## Training model: ${model}" + : ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/${model}_distributed" DATA_DIR=./imagenette @@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi @@ -66,7 +69,7 @@ check_status() { # Prepare devices ################################################# devices=$CUDA_VISIBLE_DEVICES -if [ -n "$devices" ]; then +if [ -n "$devices" ]; then devices=(${devices//,/ }) num_devices=${#devices[@]} else @@ -86,7 +89,8 @@ fi ################################################# worker_hosts="" i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do if [ "$i" == "0" ]; then let i++ continue @@ -102,13 +106,13 @@ echo "worker_hosts: ${worker_hosts}" ################################################# trap ctrl_c INT function ctrl_c() { - echo "*** Trapped CTRL-C, killing process running background" - for pid in "${pid_list[@]}"; do - echo "Killing pid ${pid}" - kill ${pid} - wait ${pid} - done - exit 0 + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + wait ${pid} + done + exit 0 } ################################################# @@ -116,9 +120,10 @@ function ctrl_c() { ################################################# pid_list=() -last_device=$(expr ${num_devices} - 1) +last_device=`expr ${num_devices} - 1` i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do job_name="worker" if [ "${i}" == "0" ]; then job_name="ps" @@ -127,26 +132,30 @@ for device in "${devices[@]}"; do if [ ${i} -le 1 ]; then task_index=0 else - task_index=$(expr ${i} - 1) + task_index=`expr ${i} - 1` fi if [ "${i}" == "${last_device}" ]; then - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated \ - --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log - [[ ${PIPESTATUS[0]} == 0 ]] || exit + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" else - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" fi let i++ diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh index 12337021f8e73fc39f05e2e487812626730ca96c..9c44c920f824220dc29d47550e4e5d33aedc547f 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=rmsprop -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/inception3" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -51,10 +53,14 @@ for arg in "$@"; do let i++ done -python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1 --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh index c48d0cbfd97bf0571cc52dfa9a8a8686e5178b07..6a8938ec5f1daf3bbdb0ad57d3151518d2aa171e 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=rmsprop -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/inception3_multigpu" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,11 +55,14 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn.py b/cv/classification/resnet50/tensorflow/benchmark_cnn.py index 6f65ea69b46f479a649c81aaddc797f30809c1ae..7f6c1db4c5ee81194c43a463c5bfad1235b0a04f 100644 --- a/cv/classification/resnet50/tensorflow/benchmark_cnn.py +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn.py @@ -31,9 +31,11 @@ import re import threading import time import traceback +import sys from absl import flags as absl_flags import numpy as np +import math import six from six.moves import xrange # pylint: disable=redefined-builtin @@ -881,6 +883,9 @@ def benchmark_one_step(sess, lossval = results['average_loss'] else: lossval = 0. + if not math.isfinite(lossval): + print("Loss is {}, stopping training".format(lossval)) + sys.exit(1) if image_producer is not None: image_producer.notify_image_consumption() train_time = time.time() - start_time diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh index 98639e5c9f5656c7a46bcc5a1f00609c1170a3f9..f4f48223c71319eb1ca461986d6f81850b57b212 100644 --- a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,7 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. - bash ./get_imagenette.sh export TF_CUDNN_USE_AUTOTUNE=1 @@ -43,7 +42,7 @@ done echo "## Training model: ${model}" -: ${BATCH_SIZE:=32} +: ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum @@ -111,6 +110,7 @@ function ctrl_c() { for pid in "${pid_list[@]}"; do echo "Killing pid ${pid}" kill ${pid} + wait ${pid} done exit 0 } diff --git a/cv/classification/vgg/tensorflow/get_num_devices.sh b/cv/classification/vgg/tensorflow/get_num_devices.sh index 14d6c0a5a7dc9a1b93e5b389d715c36aee2aa618..a9c3708955c785f98159cfa872157538f6aee1a8 100644 --- a/cv/classification/vgg/tensorflow/get_num_devices.sh +++ b/cv/classification/vgg/tensorflow/get_num_devices.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +#!/bin/bash +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -18,8 +19,8 @@ if [ -n "$devices" ]; then _devices=(${devices//,/ }) num_devices=${#_devices[@]} else - num_devices=8 - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" fi -export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} \ No newline at end of file +export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh index 36b41ee8530f24ce9301138963981817ecceb4de..c7642b98633c5e291901347fd34bef03758bd635 100644 --- a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh @@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1 i=0 model="alexnet" -for arg in "$@"; do +for arg in "$@" +do if [ $i -eq 0 ]; then model=$arg let i++ @@ -40,11 +41,12 @@ for arg in "$@"; do done echo "## Training model: ${model}" + : ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/${model}_distributed" DATA_DIR=./imagenette @@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi @@ -66,7 +69,7 @@ check_status() { # Prepare devices ################################################# devices=$CUDA_VISIBLE_DEVICES -if [ -n "$devices" ]; then +if [ -n "$devices" ]; then devices=(${devices//,/ }) num_devices=${#devices[@]} else @@ -86,7 +89,8 @@ fi ################################################# worker_hosts="" i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do if [ "$i" == "0" ]; then let i++ continue @@ -102,12 +106,12 @@ echo "worker_hosts: ${worker_hosts}" ################################################# trap ctrl_c INT function ctrl_c() { - echo "*** Trapped CTRL-C, killing process running background" - for pid in "${pid_list[@]}"; do - echo "Killing pid ${pid}" - kill ${pid} - done - exit 0 + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + done + exit 0 } ################################################# @@ -115,9 +119,10 @@ function ctrl_c() { ################################################# pid_list=() -last_device=$(expr ${num_devices} - 1) +last_device=`expr ${num_devices} - 1` i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do job_name="worker" if [ "${i}" == "0" ]; then job_name="ps" @@ -126,26 +131,30 @@ for device in "${devices[@]}"; do if [ ${i} -le 1 ]; then task_index=0 else - task_index=$(expr ${i} - 1) + task_index=`expr ${i} - 1` fi if [ "${i}" == "${last_device}" ]; then - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated \ - --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log - [[ ${PIPESTATUS[0]} == 0 ]] || exit + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" else - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" fi let i++ diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh index aeea001bcade9882bd62a66aea792200898842e2..343dbe372b175d10e7a6c6855e3348125e0d117d 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/vgg16" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -51,13 +53,15 @@ for arg in "$@"; do let i++ done -python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 \ - --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ - --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh index 4de70ff725f1a22d56f1e8e80f9f3ca73d1b3fff..d4c2bcd2ab01f05ee348fbf32c330596dd1b4d88 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/vgg16_multigpu" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,13 +55,15 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ - --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ - --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + exit ${EXIT_STATUS} diff --git a/nlp/language_model/bert/tensorflow/base/README.md b/nlp/language_model/bert/tensorflow/base/README.md index a77ae4cec2e61233508fcc956d835fad549a17a3..42ae860eb7efefe402a4340c77fcf4a0b3d281ab 100644 --- a/nlp/language_model/bert/tensorflow/base/README.md +++ b/nlp/language_model/bert/tensorflow/base/README.md @@ -1,4 +1,4 @@ -# BERT Pretraining + # BERT Pretraining ## Model description BERT, or Bidirectional Encoder Representations from Transformers, improves upon standard Transformers by removing the unidirectionality constraint by using a masked language model (MLM) pre-training objective. The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. Unlike left-to-right language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pre-train a deep bidirectional Transformer. In addition to the masked language model, BERT uses a next sentence prediction task that jointly pre-trains text-pair representations. @@ -9,6 +9,12 @@ BERT, or Bidirectional Encoder Representations from Transformers, improves upon ```shell bash init_tf.sh +wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.7.tar.gz +tar xf openmpi-4.0.7.tar.gz +cd openmpi-4.0.7/ +./configure --prefix=/usr/local/bin --with-orte +make -j4 && make install +export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH ``` ### Download datasets @@ -34,16 +40,26 @@ tips: you can git clone this repo in other place ,we need the bert_pretrain_tf_r ### Training on single card ```shell -bash run_1card_FPS.sh +bash run_1card_FPS.sh --input_files_dir=/path/to/bert_pretrain_tf_records/train_data \ + --init_checkpoint=/path/to/bert_pretrain_ckpt_tf/model.ckpt-28252 \ + --eval_files_dir=/path/to/bert_pretrain_tf_records/eval_data \ + --train_batch_size=6 \ + --bert_config_file=/path/to/bert_pretrain_ckpt_tf/bert_config.json ``` ### Training on mutil-cards ```shell -bash run_multi_card_FPS.sh +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export IX_NUM_CUDA_VISIBLE_DEVICES=8 +bash run_multi_card_FPS.sh --input_files_dir=/path/to/bert_pretrain_tf_records/train_data \ + --init_checkpoint=/path/to/bert_pretrain_ckpt_tf/model.ckpt-28252 \ + --eval_files_dir=/path/to/bert_pretrain_tf_records/eval_data \ + --train_batch_size=6 \ + --bert_config_file=/path/to/bert_pretrain_ckpt_tf/bert_config.json ``` ## Result | | acc | fps | | --- | --- | --- | -| multi_card | 0.424126 | 0.267241| +| multi_card | 0.424126 | 0.267241| \ No newline at end of file diff --git a/nlp/language_model/bert/tensorflow/base/init_tf.sh b/nlp/language_model/bert/tensorflow/base/init_tf.sh index 08f27c35bf47bc85dcd18e0dc440c95ef74c6a57..79e2ae63b18d1065fa778e7102b86ea7afc22025 100644 --- a/nlp/language_model/bert/tensorflow/base/init_tf.sh +++ b/nlp/language_model/bert/tensorflow/base/init_tf.sh @@ -13,7 +13,8 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - +pip3 uninstall -y protobuf +pip3 install "protobuf<4.0.0" pip3 install git+https://github.com/mlperf/logging.git pip3 install git+https://github.com/NVIDIA/dllogger.git -pip3 install pandas==0.24 \ No newline at end of file +pip3 install pandas==1.3.5 \ No newline at end of file diff --git a/nlp/language_model/bert/tensorflow/base/optimization.py b/nlp/language_model/bert/tensorflow/base/optimization.py index f2e747c9c0482994b329df2686748820777635dc..f7aa9f49103d2bd1380d17ad7610d7a2b6d8c408 100644 --- a/nlp/language_model/bert/tensorflow/base/optimization.py +++ b/nlp/language_model/bert/tensorflow/base/optimization.py @@ -84,7 +84,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, manual_fp if hvd and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)): optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True) if use_fp16: - loss_scaler = tf.train.experimental.DynamicLossScale( + loss_scaler = tf.compat.v1.mixed_precision.DynamicLossScale( initial_loss_scale=init_loss_scale, increment_period=1000, multiplier=2.0) optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scaler) loss_scale_value = tf.identity(loss_scaler(), name="loss_scale") diff --git a/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh b/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh index 64b9094fbb788c5c33f51ca0c3309dee2c3ad3b1..683dc6696dcf899842c7fbf3e00196977bf3272c 100644 --- a/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh +++ b/nlp/language_model/bert/tensorflow/base/run_1card_FPS.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,7 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. - set -x bash ./reset.sh @@ -34,33 +33,27 @@ fi date +%m%d%H%M%S >> ${LOG_DIR}/time.log CUDA_VISIBLE_DEVICES=0 python3 ./run_pretraining.py \ - --eval_files_dir=./bert_pretrain_tf_records/eval_data \ - --bert_config_file=./bert_pretrain_tf_ckpt/bert_config.json \ - --input_files_dir=./bert_pretrain_tf_records/train_data \ - --train_batch_size=6 \ - --init_checkpoint=./bert_pretrain_tf_ckpt/model.ckpt-28252 \ --output_dir=${OUTPUT_DIR} \ --do_train=True \ - --do_train=True \ - --do_eval=True \ + --do_eval=False \ --is_dist_eval_enabled=False \ --eval_batch_size=24 \ --max_eval_steps=100 \ --max_predictions_per_seq=76 \ --max_seq_length=512 \ - --num_train_steps=2000 \ - --num_accumulation_steps=4 \ + --num_train_steps=13206 \ + --num_accumulation_steps=1 \ --num_warmup_steps=0 \ - --save_checkpoints_steps=20000 \ + --save_checkpoints_steps=1000 \ --learning_rate=5e-5 \ - --horovod --amp --nouse_xla \ + --amp --nouse_xla \ --allreduce_post_accumulation=True \ - --enable_device_warmup=True \ + --enable_device_warmup=False \ --samples_between_eval=150000 \ --stop_threshold=0.72 \ - --samples_start_eval=100 \ + --samples_start_eval=3000000 \ --dllog_path=${OUTPUT_DIR}/bert_dllog.json "$@" rm -rf ${OUTPUT_DIR}/* -date +%m%d%H%M%S >> ${LOG_DIR}/time.log \ No newline at end of file +date +%m%d%H%M%S >> ${LOG_DIR}/time.log diff --git a/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh b/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh index 7ce2b86c69ab0c21679ea6347f3d270981e627c0..16087a784ed2dcfbf9ebfcb06b539f10e212d1e7 100644 --- a/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh +++ b/nlp/language_model/bert/tensorflow/base/run_multi_card_FPS.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,12 +14,11 @@ # License for the specific language governing permissions and limitations # under the License. - set -x : ${HOROVOD_RUN_ARGS:="--gloo"} -# bash ./reset.sh +bash ./reset.sh DATE=`date +%m%d%H%M%S` @@ -35,15 +34,8 @@ fi date +%m%d%H%M%S >> ${LOG_DIR}/time.log -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - # Training phase -horovodrun -np 8 ${HOROVOD_RUN_ARGS} python3 ./run_pretraining.py \ - --eval_files_dir=./bert_pretrain_tf_records/eval_data \ - --bert_config_file=./bert_pretrain_tf_ckpt/bert_config.json \ - --input_files_dir=./bert_pretrain_tf_records/train_data \ - --train_batch_size=6 \ - --init_checkpoint=./bert_pretrain_tf_ckpt/model.ckpt-28252 \ +horovodrun -np ${IX_NUM_CUDA_VISIBLE_DEVICES} ${HOROVOD_RUN_ARGS} python3 ./run_pretraining.py \ --output_dir=${OUTPUT_DIR} \ --do_train=True \ --do_eval=True \ @@ -70,4 +62,4 @@ exit_code=$? rm -rf ${OUTPUT_DIR}/* date +%m%d%H%M%S >> ${LOG_DIR}/time.log -exit ${exit_code} \ No newline at end of file +exit ${exit_code} diff --git a/nlp/language_model/bert/tensorflow/base/run_pretraining.py b/nlp/language_model/bert/tensorflow/base/run_pretraining.py index e2078b962eba68eca012e76d2565a6127459ce49..8efd7b5509b04d87abcfa021ccc10ae82dc1db31 100644 --- a/nlp/language_model/bert/tensorflow/base/run_pretraining.py +++ b/nlp/language_model/bert/tensorflow/base/run_pretraining.py @@ -995,7 +995,7 @@ def main(_): ''' if FLAGS.do_eval: if FLAGS.horovod: - if hvd.rank() is not 0: + if hvd.rank() != 0: return converged = False num_steps_between_eval = math.ceil(FLAGS.samples_between_eval / global_batch_size)