From 34bb68cac4dbb862a647e91c9bfd018f0f80b464 Mon Sep 17 00:00:00 2001
From: "jino.yang" 
Date: Wed, 12 Jun 2024 18:17:52 +0800
Subject: [PATCH] Sync model VGG16(tensorflow) code

Link#IA53LN

Signed-off-by: jino.yang
---
 .../vgg/tensorflow/benchmark_cnn.py           |  19 ++-
 .../run_train_distributed_imagenette.sh       | 157 ++++++++++++++++++
 .../run_train_vgg16_distributed_imagenette.sh |  18 ++
 .../tensorflow/run_train_vgg16_imagenet.sh    |  29 ++--
 .../tensorflow/run_train_vgg16_imagenette.sh  |  63 +++++++
 .../run_train_vgg16_multigpu_imagenet.sh      |  28 ++--
 .../run_train_vgg16_multigpu_imagenette.sh    |  65 ++++++++
 7 files changed, 339 insertions(+), 40 deletions(-)
 create mode 100644 cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
 create mode 100644 cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh
 create mode 100644 cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh
 create mode 100644 cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh

diff --git a/cv/classification/vgg/tensorflow/benchmark_cnn.py b/cv/classification/vgg/tensorflow/benchmark_cnn.py
index f089a8fbf..79edb4ab6 100644
--- a/cv/classification/vgg/tensorflow/benchmark_cnn.py
+++ b/cv/classification/vgg/tensorflow/benchmark_cnn.py
@@ -1,7 +1,8 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -33,9 +34,11 @@ import re
 import threading
 import time
 import traceback
+import sys
 
 from absl import flags as absl_flags
 import numpy as np
+import math
 
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -170,7 +173,7 @@ flags.DEFINE_integer('num_batches', None, 'number of batches to run, excluding '
 flags.DEFINE_integer('num_eval_batches', None,
                      'number of eval batches to run, excluding warmup. '
                      'Defaults to --num_batches')
-flags.DEFINE_float('num_epochs', 100,
+flags.DEFINE_float('num_epochs', 90,
                    'number of epochs to run, excluding warmup. '
                    'This and --num_batches cannot both be specified.')
 flags.DEFINE_float('num_eval_epochs', None,
@@ -322,7 +325,7 @@ flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
                   'Optimizer to use')
 flags.DEFINE_float('init_learning_rate', None,
                    'Initial learning rate for training.')
-flags.DEFINE_string('piecewise_learning_rate_schedule', "0.01;30;0.001;60;0.0001;90;0.00001",
+flags.DEFINE_string('piecewise_learning_rate_schedule', None,
                     'Specifies a piecewise learning rate schedule based on the '
                     'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, '
                     'where each LRi is a learning rate and each Ei is an epoch '
@@ -331,10 +334,10 @@ flags.DEFINE_string('piecewise_learning_rate_schedule', "0.01;30;0.001;60;0.0001
                     'parameter is 0.3;10;0.2;25;0.1, the learning rate is 0.3 '
                     'for the first 10 epochs, then is 0.2 for the next 15 '
                     'epochs, then is 0.1 until training ends.')
-flags.DEFINE_float('num_epochs_per_decay', None,
+flags.DEFINE_float('num_epochs_per_decay', 0,
                    'Steps after which learning rate decays. If 0, the learning '
                    'rate does not decay.')
-flags.DEFINE_float('learning_rate_decay_factor', None,
+flags.DEFINE_float('learning_rate_decay_factor', 0,
                    'Learning rate decay factor. Decay by this factor every '
                    '`num_epochs_per_decay` epochs. If 0, learning rate does '
                    'not decay.')
@@ -358,7 +361,7 @@ flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
 flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer')
 flags.DEFINE_float('gradient_clip', None,
                    'Gradient clipping magnitude. Disabled by default.')
-flags.DEFINE_float('weight_decay', 0.0001,
+flags.DEFINE_float('weight_decay', 0.00004,
                    'Weight decay factor for training.')
 flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
                    'If non-zero, the fraction of GPU memory that will be used. '
@@ -883,6 +886,9 @@ def benchmark_one_step(sess,
       lossval = results['average_loss']
     else:
       lossval = 0.
+    if not math.isfinite(lossval):
+      print("Loss is {}, stopping training".format(lossval))
+      sys.exit(1)
   if image_producer is not None:
     image_producer.notify_image_consumption()
   train_time = time.time() - start_time
@@ -1196,7 +1202,6 @@ def get_learning_rate(params, global_step, num_examples_per_epoch, model,
       learning_rate = get_piecewise_learning_rate(
           params.piecewise_learning_rate_schedule,
           global_step, num_batches_per_epoch)
-      print("learning_rate",learning_rate)
   elif params.init_learning_rate is not None:
     learning_rate = params.init_learning_rate
     if (params.num_epochs_per_decay > 0 and
diff --git a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
new file mode 100644
index 000000000..36b41ee85
--- /dev/null
+++ b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+bash ./get_imagenette.sh
+
+export TF_CUDNN_USE_AUTOTUNE=1
+export TF_CPP_MIN_LOG_LEVEL=1
+
+#################################################
+# Prepare training arguments
+#################################################
+
+i=0
+model="alexnet"
+for arg in "$@"; do
+    if [ $i -eq 0 ]; then
+        model=$arg
+        let i++
+        continue
+    fi
+    if [[ $arg =~ "--epoch" ]]; then
+        new_args[$i]="--num_epochs"
+    else
+        new_args[$i]=$arg
+    fi
+    let i++
+done
+echo "## Training model: ${model}"
+
+: ${BATCH_SIZE:=16}
+# TRAIN_EPOCHS=10
+# optional optimizer: adam, rmsprop, momentum, sgd
+OPTIMIZER=momentum
+DATE=$(date +%Y%m%d%H%M%S)
+
+LOG_DIR="logs/${model}_distributed"
+DATA_DIR=./imagenette
+BASE_DIR=train_dir
+TRAIN_DIR=${BASE_DIR}/${model}_distributed
+
+mkdir -p ${LOG_DIR}
+mkdir -p ${BASE_DIR}
+rm -rf ${TRAIN_DIR}
+
+EXIT_STATUS=0
+check_status() {
+    if ((${PIPESTATUS[0]} != 0)); then
+        EXIT_STATUS=1
+    fi
+}
+
+#################################################
+# Prepare devices
+#################################################
+devices=$CUDA_VISIBLE_DEVICES
+if [ -n "$devices" ]; then
+    devices=(${devices//,/ })
+    num_devices=${#devices[@]}
+else
+    devices=(0 1)
+    num_devices=2
+fi
+echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
+echo "num_devices: ${num_devices}"
+
+if [ "${num_devices}" == "1" ]; then
+    echo "Error: The number of devices must be greater than 1 for distributed training, but got CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}."
+    exit 0
+fi
+
+#################################################
+# Prepare distributed training arguments
+#################################################
+worker_hosts=""
+i=0
+for device in "${devices[@]}"; do
+    if [ "$i" == "0" ]; then
+        let i++
+        continue
+    fi
+    let i++
+    worker_hosts="${worker_hosts},127.0.0.1:5000${device}"
+done
+worker_hosts=${worker_hosts#*,}
+echo "worker_hosts: ${worker_hosts}"
+
+#################################################
+# Handle CTRL-C
+#################################################
+trap ctrl_c INT
+function ctrl_c() {
+    echo "*** Trapped CTRL-C, killing background processes"
+    for pid in "${pid_list[@]}"; do
+        echo "Killing pid ${pid}"
+        kill ${pid}
+    done
+    exit 0
+}
+
+#################################################
+# Start distributed training
+#################################################
+
+pid_list=()
+last_device=$(expr ${num_devices} - 1)
+i=0
+for device in "${devices[@]}"; do
+    job_name="worker"
+    if [ "${i}" == "0" ]; then
+        job_name="ps"
+    fi
+
+    if [ ${i} -le 1 ]; then
+        task_index=0
+    else
+        task_index=$(expr ${i} - 1)
+    fi
+
+    if [ "${i}" == "${last_device}" ]; then
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
+            --data_format=NCHW \
+            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
+            --batch_size=${BATCH_SIZE} --model=${model} \
+            --variable_update=distributed_replicated \
+            --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
+            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
+        [[ ${PIPESTATUS[0]} == 0 ]] || exit
+        echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}"
+    else
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
+            --data_format=NCHW \
+            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
+            --batch_size=${BATCH_SIZE} --model=${model} \
+            --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
+            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
+        echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}"
+    fi
+    let i++
+    pid_list+=($!)
+done
+
+echo "All subprocesses: ${pid_list[*]}"
+ctrl_c
+exit ${EXIT_STATUS}
diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh
new file mode 100644
index 000000000..81d94a9a3
--- /dev/null
+++ b/cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+bash ./run_train_distributed_imagenette.sh vgg16 "$@"
+exit $?
diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh
index b6b1946ed..342a88f67 100644
--- a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh
+++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh
@@ -14,8 +14,6 @@
 # License for the specific language governing permissions and limitations
 # under the License.
- - export TF_CUDNN_USE_AUTOTUNE=1 export TF_CPP_MIN_LOG_LEVEL=1 @@ -23,7 +21,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=`date +%Y%m%d%H%M%S` +DATE=$(date +%Y%m%d%H%M%S) LOG_DIR="logs/vgg16" DATA_DIR=./imagenet_tfrecord @@ -35,16 +33,14 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() -{ +check_status() { if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@" -do +for arg in "$@"; do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,15 +49,14 @@ do let i++ done -python3 -u tf_cnn_benchmarks.py\ - --data_name=imagenet --data_dir=${DATA_DIR}\ - --data_format=NCHW --batch_size=${BATCH_SIZE}\ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1\ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ - --eval_during_training_every_n_epochs=2\ - --num_eval_epochs=1 --datasets_use_caching\ - --stop_at_top_1_accuracy=0.9\ - --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit - +python3 -u tf_cnn_benchmarks.py --data_name=imagenet --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 \ + --num_eval_epochs=1 --datasets_use_caching \ + --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh new file mode 100644 index 000000000..aeea001bc --- /dev/null +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=$(date +%Y%m%d%H%M%S) + +LOG_DIR="logs/vgg16" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/vgg16 + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() { + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@"; do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 \ + --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit + +exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh index c68e44371..56d6a5c00 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=`date +%Y%m%d%H%M%S` +DATE=$(date +%Y%m%d%H%M%S) LOG_DIR="logs/vgg16_multigpu" DATA_DIR=./imagenet_tfrecord @@ -35,16 +35,14 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() -{ +check_status() { if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@" -do +for arg in "$@"; do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -55,15 +53,13 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ - --data_name=imagenet --data_dir=${DATA_DIR}\ - --data_format=NCHW --batch_size=${BATCH_SIZE}\ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ - --eval_during_training_every_n_epochs=2\ - --num_eval_epochs=1 --datasets_use_caching\ - --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ - --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenet --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit - -exit ${EXIT_STATUS} \ No newline at end of file +exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh 
new file mode 100644 index 000000000..4de70ff72 --- /dev/null +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=$(date +%Y%m%d%H%M%S) + +LOG_DIR="logs/vgg16_multigpu" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/vgg16_multigpu + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() { + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@"; do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +source ./get_num_devices.sh + +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit + +exit ${EXIT_STATUS} -- Gitee
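
Usage sketch, assuming the patch is applied and the scripts run from
cv/classification/vgg/tensorflow/ (all paths and flags below are taken from the
scripts above): each runner rewrites `--epoch N` into `--num_epochs N` via its
argument loop, and reads BATCH_SIZE and CUDA_VISIBLE_DEVICES from the environment.

    cd cv/classification/vgg/tensorflow

    # Single-GPU VGG16 on Imagenette; the script fetches the dataset via get_imagenette.sh.
    BATCH_SIZE=32 bash run_train_vgg16_imagenette.sh --epoch 10

    # Multi-GPU VGG16 on Imagenette; the device count comes from get_num_devices.sh.
    bash run_train_vgg16_multigpu_imagenette.sh --epoch 10

    # Distributed VGG16: the first listed device runs the parameter server, the rest run workers.
    CUDA_VISIBLE_DEVICES=0,1,2 bash run_train_vgg16_distributed_imagenette.sh --epoch 10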