From e96b3d7b03d66c2f0c3f390c3d03e05c41fc788f Mon Sep 17 00:00:00 2001 From: "jino.yang" Date: Wed, 12 Jun 2024 17:45:03 +0800 Subject: [PATCH] Sync model AlexNet (tensorflow) code. Signed-off-by: jino.yang --- .../alexnet/tensorflow/benchmark_cnn.py | 7 ++ ...un_train_alexnet_distributed_imagenette.sh | 17 +++++ .../run_train_alexnet_imagenette.sh | 67 ++++++++++++++++++ .../run_train_alexnet_multigpu_imagenette.sh | 69 +++++++++++++++++++ 4 files changed, 160 insertions(+) create mode 100644 cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh create mode 100644 cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh create mode 100644 cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh diff --git a/cv/classification/alexnet/tensorflow/benchmark_cnn.py b/cv/classification/alexnet/tensorflow/benchmark_cnn.py index 6f65ea69b..3afda107c 100644 --- a/cv/classification/alexnet/tensorflow/benchmark_cnn.py +++ b/cv/classification/alexnet/tensorflow/benchmark_cnn.py @@ -1,4 +1,6 @@ # Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,9 +33,11 @@ import re import threading import time import traceback +import sys from absl import flags as absl_flags import numpy as np +import math import six from six.moves import xrange # pylint: disable=redefined-builtin @@ -881,6 +885,9 @@ def benchmark_one_step(sess, lossval = results['average_loss'] else: lossval = 0. + if not math.isfinite(lossval): + print("Loss is {}, stopping training".format(lossval)) + sys.exit(1) if image_producer is not None: image_producer.notify_image_consumption() train_time = time.time() - start_time diff --git a/cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh b/cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh new file mode 100644 index 000000000..89e9880f9 --- /dev/null +++ b/cv/classification/alexnet/tensorflow/run_train_alexnet_distributed_imagenette.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./run_train_distributed_imagenette.sh alexnet "$@" diff --git a/cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh b/cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh new file mode 100644 index 000000000..6a05462cb --- /dev/null +++ b/cv/classification/alexnet/tensorflow/run_train_alexnet_imagenette.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: momentum, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/alexnet" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/alexnet + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@" +do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=alexnet --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + +exit ${EXIT_STATUS} diff --git a/cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh b/cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh new file mode 100644 index 000000000..b10133306 --- /dev/null +++ b/cv/classification/alexnet/tensorflow/run_train_alexnet_multigpu_imagenette.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=8} +#TRAIN_EPOCHS=10 +# optional optimizer: momentum, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/alexnet_multigpu" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/alexnet_multigpu + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@" +do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +source ./get_num_devices.sh + +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=alexnet --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + +exit ${EXIT_STATUS} -- Gitee