From e96b3d7b03d66c2f0c3f390c3d03e05c41fc788f Mon Sep 17 00:00:00 2001
From: "jino.yang" <ex.guoqi.yang@iluvatar.com>
Date: Wed, 12 Jun 2024 17:45:03 +0800
Subject: [PATCH] Sync model AlexNet (tensorflow) code.

Signed-off-by: jino.yang <ex.guoqi.yang@iluvatar.com>
---
 .../alexnet/tensorflow/benchmark_cnn.py       |  7 ++
 ...un_train_alexnet_distributed_imagenette.sh | 17 +++++
 .../run_train_alexnet_imagenette.sh           | 67 ++++++++++++++++++
 .../run_train_alexnet_multigpu_imagenette.sh  | 69 +++++++++++++++++++
 4 files changed, 160 insertions(+)
 create mode 100644 cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh
 create mode 100644 cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh
 create mode 100644 cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh

diff --git a/cv/classification/alexnet/tensorflow/benchmark_cnn.py b/cv/classification/alexnet/tensorflow/benchmark_cnn.py
index 6f65ea69b..3afda107c 100644
--- a/cv/classification/alexnet/tensorflow/benchmark_cnn.py
+++ b/cv/classification/alexnet/tensorflow/benchmark_cnn.py
@@ -1,4 +1,6 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,9 +33,11 @@ import re
 import threading
 import time
 import traceback
+import sys
 
 from absl import flags as absl_flags
 import numpy as np
+import math
 
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -881,6 +885,9 @@ def benchmark_one_step(sess,
     lossval = results['average_loss']
   else:
     lossval = 0.
+  if not math.isfinite(lossval):
+    print("Loss is {}, stopping training".format(lossval))
+    sys.exit(1)
   if image_producer is not None:
     image_producer.notify_image_consumption()
   train_time = time.time() - start_time
diff --git a/cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh b/cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh
new file mode 100644
index 000000000..89e9880f9
--- /dev/null
+++ b/cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+# Delegate to run_train_distributed_imagenette.sh, passing "alexnet" first and then this script's own arguments.
+bash ./run_train_distributed_imagenette.sh alexnet "$@"
diff --git a/cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh b/cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh
new file mode 100644
index 000000000..6a05462cb
--- /dev/null
+++ b/cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+bash ./get_imagenette.sh
+
+export TF_CUDNN_USE_AUTOTUNE=1
+export TF_CPP_MIN_LOG_LEVEL=1
+
+# BATCH_SIZE can be overridden from the environment; it defaults to 32.
+: "${BATCH_SIZE:=32}"
+# NOTE: TRAIN_EPOCHS is not assigned here, so its field in the log file name is
+# empty unless the caller exports it.
+#TRAIN_EPOCHS=10
+# optional optimizer: momentum, rmsprop, sgd
+OPTIMIZER=momentum
+DATE=$(date +%Y%m%d%H%M%S)
+
+LOG_DIR="logs/alexnet"
+DATA_DIR=./imagenette
+BASE_DIR=train_dir
+TRAIN_DIR=${BASE_DIR}/alexnet
+
+mkdir -p "${LOG_DIR}"
+mkdir -p "${BASE_DIR}"
+# Clear whatever a previous run left in the directory passed as --train_dir.
+rm -rf "${TRAIN_DIR}"
+
+# Map the generic --epoch/--epochs option onto --num_epochs (the value is kept for
+# both "--epoch N" and "--epoch=N"); all other arguments are forwarded unchanged.
+new_args=()
+for arg in "$@"
+do
+    case "${arg}" in
+        --epoch|--epochs)
+            new_args+=("--num_epochs") ;;
+        --epoch=*|--epochs=*)
+            new_args+=("--num_epochs=${arg#*=}") ;;
+        *)
+            new_args+=("${arg}") ;;
+    esac
+done
+
+python3 -u tf_cnn_benchmarks.py\
+ --data_name=imagenette --data_dir=${DATA_DIR}\
+ --data_format=NCHW --batch_size=${BATCH_SIZE}\
+ --model=alexnet --optimizer=${OPTIMIZER} --num_gpus=1\
+ --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\
+ --eval_during_training_every_n_epochs=2\
+ --num_eval_epochs=1 --datasets_use_caching\
+ --stop_at_top_1_accuracy=0.9\
+ --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee "${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log"
+
+# The pipeline's own status is tee's; exit with the trainer's status (first stage).
+exit "${PIPESTATUS[0]}"
diff --git a/cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh b/cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh
new file mode 100644
index 000000000..b10133306
--- /dev/null
+++ b/cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+bash ./get_imagenette.sh
+
+export TF_CUDNN_USE_AUTOTUNE=1
+export TF_CPP_MIN_LOG_LEVEL=1
+
+# BATCH_SIZE can be overridden from the environment; it defaults to 8.
+: "${BATCH_SIZE:=8}"
+# NOTE: TRAIN_EPOCHS is not assigned here, so its field in the log file name is
+# empty unless the caller exports it.
+#TRAIN_EPOCHS=10
+# optional optimizer: momentum, rmsprop, sgd
+OPTIMIZER=momentum
+DATE=$(date +%Y%m%d%H%M%S)
+
+LOG_DIR="logs/alexnet_multigpu"
+DATA_DIR=./imagenette
+BASE_DIR=train_dir
+TRAIN_DIR=${BASE_DIR}/alexnet_multigpu
+
+mkdir -p "${LOG_DIR}"
+mkdir -p "${BASE_DIR}"
+# Clear whatever a previous run left in the directory passed as --train_dir.
+rm -rf "${TRAIN_DIR}"
+
+# Map the generic --epoch/--epochs option onto --num_epochs (the value is kept for
+# both "--epoch N" and "--epoch=N"); all other arguments are forwarded unchanged.
+new_args=()
+for arg in "$@"
+do
+    case "${arg}" in
+        --epoch|--epochs)
+            new_args+=("--num_epochs") ;;
+        --epoch=*|--epochs=*)
+            new_args+=("--num_epochs=${arg#*=}") ;;
+        *)
+            new_args+=("${arg}") ;;
+    esac
+done
+
+# Sourced so its variables reach this shell (presumably IX_NUM_CUDA_VISIBLE_DEVICES - confirm).
+source ./get_num_devices.sh
+
+UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
+ --data_name=imagenette --data_dir=${DATA_DIR}\
+ --data_format=NCHW --batch_size=${BATCH_SIZE}\
+ --model=alexnet --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\
+ --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\
+ --eval_during_training_every_n_epochs=2\
+ --num_eval_epochs=1 --datasets_use_caching\
+ --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\
+ --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee "${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log"
+# The pipeline's own status is tee's; exit with the trainer's status (first stage).
+exit "${PIPESTATUS[0]}"
-- 
Gitee