From 34bb68cac4dbb862a647e91c9bfd018f0f80b464 Mon Sep 17 00:00:00 2001
From: "jino.yang" 
Date: Wed, 12 Jun 2024 18:17:52 +0800
Subject: [PATCH] Sync model VGG16(tensorflow) code

Link#IA53LN

Signed-off-by: jino.yang
---
 .../vgg/tensorflow/benchmark_cnn.py           |  19 ++-
 .../run_train_distributed_imagenette.sh       | 157 ++++++++++++++++++
 .../run_train_vgg16_distributed_imagenette.sh |  18 ++
 .../tensorflow/run_train_vgg16_imagenet.sh    |  29 ++--
 .../tensorflow/run_train_vgg16_imagenette.sh  |  63 +++++++
 .../run_train_vgg16_multigpu_imagenet.sh      |  28 ++--
 .../run_train_vgg16_multigpu_imagenette.sh    |  65 ++++++++
 7 files changed, 339 insertions(+), 40 deletions(-)
 create mode 100644 cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
 create mode 100644 cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh
 create mode 100644 cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh
 create mode 100644 cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh

diff --git a/cv/classification/vgg/tensorflow/benchmark_cnn.py b/cv/classification/vgg/tensorflow/benchmark_cnn.py
index f089a8fbf..79edb4ab6 100644
--- a/cv/classification/vgg/tensorflow/benchmark_cnn.py
+++ b/cv/classification/vgg/tensorflow/benchmark_cnn.py
@@ -1,7 +1,8 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
-# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -33,9 +34,11 @@ import re
 import threading
 import time
 import traceback
+import sys
 
 from absl import flags as absl_flags
 import numpy as np
+import math
 
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -170,7 +173,7 @@ flags.DEFINE_integer('num_batches', None, 'number of batches to run, excluding '
 flags.DEFINE_integer('num_eval_batches', None,
                      'number of eval batches to run, excluding warmup. '
                      'Defaults to --num_batches')
-flags.DEFINE_float('num_epochs', 100,
+flags.DEFINE_float('num_epochs', 90,
                    'number of epochs to run, excluding warmup. '
                    'This and --num_batches cannot both be specified.')
 flags.DEFINE_float('num_eval_epochs', None,
@@ -322,7 +325,7 @@ flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
                   'Optimizer to use')
 flags.DEFINE_float('init_learning_rate', None,
                    'Initial learning rate for training.')
-flags.DEFINE_string('piecewise_learning_rate_schedule', "0.01;30;0.001;60;0.0001;90;0.00001",
+flags.DEFINE_string('piecewise_learning_rate_schedule', None,
                     'Specifies a piecewise learning rate schedule based on the '
                     'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, '
                     'where each LRi is a learning rate and each Ei is an epoch '
@@ -331,10 +334,10 @@ flags.DEFINE_string('piecewise_learning_rate_schedule', "0.01;30;0.001;60;0.0001
                     'parameter is 0.3;10;0.2;25;0.1, the learning rate is 0.3 '
                     'for the first 10 epochs, then is 0.2 for the next 15 '
                     'epochs, then is 0.1 until training ends.')
-flags.DEFINE_float('num_epochs_per_decay', None,
+flags.DEFINE_float('num_epochs_per_decay', 0,
                    'Steps after which learning rate decays. If 0, the learning '
                    'rate does not decay.')
-flags.DEFINE_float('learning_rate_decay_factor', None,
+flags.DEFINE_float('learning_rate_decay_factor', 0,
                    'Learning rate decay factor. Decay by this factor every '
                    '`num_epochs_per_decay` epochs. If 0, learning rate does '
                    'not decay.')
@@ -358,7 +361,7 @@ flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
 flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer')
 flags.DEFINE_float('gradient_clip', None,
                    'Gradient clipping magnitude. Disabled by default.')
-flags.DEFINE_float('weight_decay', 0.0001,
+flags.DEFINE_float('weight_decay', 0.00004,
                    'Weight decay factor for training.')
 flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
                    'If non-zero, the fraction of GPU memory that will be used. '
@@ -883,6 +886,9 @@ def benchmark_one_step(sess,
       lossval = results['average_loss']
     else:
       lossval = 0.
+    if not math.isfinite(lossval):
+      print("Loss is {}, stopping training".format(lossval))
+      sys.exit(1)
   if image_producer is not None:
     image_producer.notify_image_consumption()
   train_time = time.time() - start_time
@@ -1196,7 +1202,6 @@ def get_learning_rate(params, global_step, num_examples_per_epoch, model,
       learning_rate = get_piecewise_learning_rate(
           params.piecewise_learning_rate_schedule,
           global_step, num_batches_per_epoch)
-      print("learning_rate",learning_rate)
   elif params.init_learning_rate is not None:
     learning_rate = params.init_learning_rate
     if (params.num_epochs_per_decay > 0 and
diff --git a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
new file mode 100644
index 000000000..36b41ee85
--- /dev/null
+++ b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+bash ./get_imagenette.sh
+
+export TF_CUDNN_USE_AUTOTUNE=1
+export TF_CPP_MIN_LOG_LEVEL=1
+
+#################################################
+# Prepare training arguments
+#################################################
+
+i=0
+model="alexnet"
+for arg in "$@"; do
+    if [ $i -eq 0 ]; then
+        model=$arg
+        let i++
+        continue
+    fi
+    if [[ $arg =~ "--epoch" ]]; then
+        new_args[$i]="--num_epochs"
+    else
+        new_args[$i]=$arg
+    fi
+    let i++
+done
+echo "## Training model: ${model}"
+
+: ${BATCH_SIZE:=16}
+# TRAIN_EPOCHS=10
+# optional optimizer: adam, rmsprop, momentum, sgd
+OPTIMIZER=momentum
+DATE=$(date +%Y%m%d%H%M%S)
+
+LOG_DIR="logs/${model}_distributed"
+DATA_DIR=./imagenette
+BASE_DIR=train_dir
+TRAIN_DIR=${BASE_DIR}/${model}_distributed
+
+mkdir -p ${LOG_DIR}
+mkdir -p ${BASE_DIR}
+rm -rf ${TRAIN_DIR}
+
+EXIT_STATUS=0
+check_status() {
+    if ((${PIPESTATUS[0]} != 0)); then
+        EXIT_STATUS=1
+    fi
+}
+
+#################################################
+# Prepare devices
+#################################################
+devices=$CUDA_VISIBLE_DEVICES
+if [ -n "$devices" ]; then
+    devices=(${devices//,/ })
+    num_devices=${#devices[@]}
+else
+    devices=(0 1)
+    num_devices=2
+fi
+echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
+echo "num_devices: ${num_devices}"
+
+if [ "${num_devices}" == "1" ]; then
+    echo "Error: The number of devices must be greater than 1 for distributed training, but got CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}."
+    exit 0
+fi
+
+#################################################
+# Prepare distributed training arguments
+#################################################
+worker_hosts=""
+i=0
+for device in "${devices[@]}"; do
+    if [ "$i" == "0" ]; then
+        let i++
+        continue
+    fi
+    let i++
+    worker_hosts="${worker_hosts},127.0.0.1:5000${device}"
+done
+worker_hosts=${worker_hosts#*,}
+echo "worker_hosts: ${worker_hosts}"
+
+#################################################
+# Handle CTRL-C
+#################################################
+trap ctrl_c INT
+function ctrl_c() {
+    echo "*** Trapped CTRL-C, killing background processes"
+    for pid in "${pid_list[@]}"; do
+        echo "Killing pid ${pid}"
+        kill ${pid}
+    done
+    exit 0
+}
+
+#################################################
+# Start distributed training
+#################################################
+
+pid_list=()
+last_device=$(expr ${num_devices} - 1)
+i=0
+for device in "${devices[@]}"; do
+    job_name="worker"
+    if [ "${i}" == "0" ]; then
+        job_name="ps"
+    fi
+
+    if [ ${i} -le 1 ]; then
+        task_index=0
+    else
+        task_index=$(expr ${i} - 1)
+    fi
+
+    if [ "${i}" == "${last_device}" ]; then
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
+            --data_format=NCHW \
+            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
+            --batch_size=${BATCH_SIZE} --model=${model} \
+            --variable_update=distributed_replicated \
+            --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
+            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log
+        [[ ${PIPESTATUS[0]} == 0 ]] || exit
+        echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}"
+    else
+        CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \
+            --data_format=NCHW \
+            --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \
+            --batch_size=${BATCH_SIZE} --model=${model} \
+            --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \
+            --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
+        echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}"
+    fi
+    let i++
+    pid_list+=($!)
+done
+
+echo "All subprocesses: ${pid_list[*]}"
+ctrl_c
+exit ${EXIT_STATUS}
diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh
new file mode 100644
index 000000000..81d94a9a3
--- /dev/null
+++ b/cv/classification/vgg/tensorflow/run_train_vgg16_distributed_imagenette.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+bash ./run_train_distributed_imagenette.sh vgg16 "$@"
+exit $?
diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh
index b6b1946ed..342a88f67 100644
--- a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh
+++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenet.sh
@@ -14,8 +14,6 @@
 # License for the specific language governing permissions and limitations
 # under the License.
- - export TF_CUDNN_USE_AUTOTUNE=1 export TF_CPP_MIN_LOG_LEVEL=1 @@ -23,7 +21,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=`date +%Y%m%d%H%M%S` +DATE=$(date +%Y%m%d%H%M%S) LOG_DIR="logs/vgg16" DATA_DIR=./imagenet_tfrecord @@ -35,16 +33,14 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() -{ +check_status() { if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@" -do +for arg in "$@"; do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,15 +49,14 @@ do let i++ done -python3 -u tf_cnn_benchmarks.py\ - --data_name=imagenet --data_dir=${DATA_DIR}\ - --data_format=NCHW --batch_size=${BATCH_SIZE}\ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1\ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ - --eval_during_training_every_n_epochs=2\ - --num_eval_epochs=1 --datasets_use_caching\ - --stop_at_top_1_accuracy=0.9\ - --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit - +python3 -u tf_cnn_benchmarks.py --data_name=imagenet --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 \ + --num_eval_epochs=1 --datasets_use_caching \ + --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh new file mode 100644 index 000000000..aeea001bc --- /dev/null +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=$(date +%Y%m%d%H%M%S) + +LOG_DIR="logs/vgg16" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/vgg16 + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() { + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@"; do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 \ + --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit + +exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh index c68e44371..56d6a5c00 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenet.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=`date +%Y%m%d%H%M%S` +DATE=$(date +%Y%m%d%H%M%S) LOG_DIR="logs/vgg16_multigpu" DATA_DIR=./imagenet_tfrecord @@ -35,16 +35,14 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() -{ +check_status() { if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@" -do +for arg in "$@"; do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -55,15 +53,13 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ - --data_name=imagenet --data_dir=${DATA_DIR}\ - --data_format=NCHW --batch_size=${BATCH_SIZE}\ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ - --eval_during_training_every_n_epochs=2\ - --num_eval_epochs=1 --datasets_use_caching\ - --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ - --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenet --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit - -exit ${EXIT_STATUS} \ No newline at end of file +exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh 
new file mode 100644 index 000000000..4de70ff72 --- /dev/null +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=$(date +%Y%m%d%H%M%S) + +LOG_DIR="logs/vgg16_multigpu" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/vgg16_multigpu + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() { + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@"; do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +source ./get_num_devices.sh + +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ + --data_format=NCHW --batch_size=${BATCH_SIZE} \ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ + --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ + --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log +[[ ${PIPESTATUS[0]} == 0 ]] || exit + +exit ${EXIT_STATUS} -- Gitee
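
Usage sketch, assuming the patch is applied and the scripts run from
cv/classification/vgg/tensorflow/ (all paths and flags below are taken from the
scripts above): each runner rewrites `--epoch N` into `--num_epochs N` via its
argument loop, and reads BATCH_SIZE and CUDA_VISIBLE_DEVICES from the environment.

    cd cv/classification/vgg/tensorflow

    # Single-GPU VGG16 on Imagenette; the script fetches the dataset via get_imagenette.sh.
    BATCH_SIZE=32 bash run_train_vgg16_imagenette.sh --epoch 10

    # Multi-GPU VGG16 on Imagenette; the device count comes from get_num_devices.sh.
    bash run_train_vgg16_multigpu_imagenette.sh --epoch 10

    # Distributed VGG16: the first listed device runs the parameter server, the rest run workers.
    CUDA_VISIBLE_DEVICES=0,1,2 bash run_train_vgg16_distributed_imagenette.sh --epoch 10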