diff --git a/cv/classification/resnet50/tensorflow/README.md b/cv/classification/resnet50/tensorflow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..66083d9525952a2446fb9c0a4baea5842808d16a
--- /dev/null
+++ b/cv/classification/resnet50/tensorflow/README.md
@@ -0,0 +1,37 @@
+
+## Prepare
+
+### Install packages
+
+```shell
+pip3 install absl-py git+https://github.com/NVIDIA/dllogger#egg=dllogger
+```
+
+### Download datasets
+
+
+Download the ImageNet dataset and convert it to TFRecord format, following
+[ImageNet-to-TFrecord](https://github.com/kmonachopoulos/ImageNet-to-TFrecord) or the
+[TensorFlow Slim instructions](https://github.com/tensorflow/models/tree/master/research/slim#downloading-and-converting-to-tfrecord-format), then store the converted TFRecord files in a directory named `imagenet_tfrecord`.
+
+
+## Training
+
+### Training on a single card
+
+```shell
+bash run_train_resnet50_imagenette.sh
+```
+
+### Training on multiple cards
+
+```shell
+bash run_train_resnet50_multigpu_imagenette.sh
+```
+
+
+## Result
+
+| config | acc | fps |
+| --- | --- | --- |
+| multi_card | 0.9860 | 236.9 |
\ No newline at end of file
diff --git a/cv/classification/resnet50/tensorflow/README_origin.md b/cv/classification/resnet50/tensorflow/README_origin.md
new file mode 100644
index 0000000000000000000000000000000000000000..e7b746487bcf0daad38d4522580a170ac58523f2
--- /dev/null
+++ b/cv/classification/resnet50/tensorflow/README_origin.md
@@ -0,0 +1,88 @@
+# tf_cnn_benchmarks: High performance benchmarks
+
+**Note: tf_cnn_benchmarks is no longer maintained.**
+
+tf_cnn_benchmarks contains TensorFlow 1 implementations of several popular
+convolutional models, and is designed to be as fast as possible.
+tf_cnn_benchmarks supports both running on a single machine or running in
+distributed mode across multiple hosts.
+
+tf_cnn_benchmarks is no longer maintained. Although it will run with TensorFlow
+2, it was written and optimized for TensorFlow 1, and has not been maintained
+since TensorFlow 2 was released. For clean and easy-to-read TensorFlow 2 models,
+please see the [TensorFlow Official
+Models](https://github.com/tensorflow/models/tree/master/official).
+
+## Getting Started
+
+To run ResNet50 with synthetic data without distortions with a single GPU, run
+
+```
+python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
+```
+
+Note that the master branch of tf_cnn_benchmarks occasionally requires the
+latest nightly version of TensorFlow. You can install the nightly version by
+running `pip install tf-nightly-gpu` in a clean environment, or by installing
+TensorFlow from source. We sometimes will create a branch of tf_cnn_benchmarks,
+in the form of cnn_tf_vX.Y_compatible, that is compatible with TensorFlow
+version X.Y. For example, branch
+[cnn_tf_v1.9_compatible](https://github.com/tensorflow/benchmarks/tree/cnn_tf_v1.9_compatible/scripts/tf_cnn_benchmarks)
+works with TensorFlow 1.9. However, as tf_cnn_benchmarks is no longer
+maintained, we will likely no longer create new branches.
+
+Some important flags are
+
+* model: Model to use, e.g. resnet50, inception3, vgg16, and alexnet.
+* num_gpus: Number of GPUs to use.
+* data_dir: Path to data to process. If not set, synthetic data is used. To
+  use Imagenet data use these
+  [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
+  as a starting point.
+* batch_size: Batch size for each GPU.
+* variable_update: The method for managing variables: parameter_server + ,replicated, distributed_replicated, independent +* local_parameter_device: Device to use as parameter server: cpu or gpu. + +To see the full list of flags, run `python tf_cnn_benchmarks.py --help`. + +To run ResNet50 with real data with 8 GPUs, run: + +``` +python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=256 \ +--model=resnet50 --optimizer=momentum --variable_update=replicated \ +--nodistortions --gradient_repacking=8 --num_gpus=8 \ +--num_epochs=90 --weight_decay=1e-4 --data_dir=${DATA_DIR} --use_fp16 \ +--train_dir=${CKPT_DIR} +``` +This will train a ResNet-50 model on ImageNet with 2048 batch size on 8 +GPUs. The model should train to around 76% accuracy. + +## Running the tests + +To run the tests, run + +```bash +pip install portpicker +python run_tests.py && python run_tests.py --run_distributed_tests +``` + +Note the tests require portpicker. + +The command above runs a subset of tests that is both fast and fairly +comprehensive. Alternatively, all the tests can be run, but this will take a +long time: + +```bash +python run_tests.py --full_tests && python run_tests.py --full_tests --run_distributed_tests +``` + +We will run all tests on every PR before merging them, so it is not necessary +to pass `--full_tests` when running tests yourself. + +To run an individual test, such as method `testParameterServer` of test class +`TfCnnBenchmarksTest` of module `benchmark_cnn_test`, run + +```bash +python -m unittest -v benchmark_cnn_test.TfCnnBenchmarksTest.testParameterServer +``` diff --git a/cv/classification/resnet50/tensorflow/all_reduce_benchmark.py b/cv/classification/resnet50/tensorflow/all_reduce_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..155861c099089c59fe3439e6ef18b5e7e48d81ab --- /dev/null +++ b/cv/classification/resnet50/tensorflow/all_reduce_benchmark.py @@ -0,0 +1,290 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks. + +tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is +useful for benchmarking the performance of just this gradient aggregation, +instead of the entire model. All the flags that tf_cnn_benchmarks accepts are +also accepted by this script, although many are silently ignored. + +The number and shapes of the tensors all-reduced are those of the variables of +the model specified by the --model flag. +TODO(reedwm): Allow custom sizes to be specified. 
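+
+Example invocation (values are illustrative; any tf_cnn_benchmarks flag may be
+passed, and --variable_update=replicated is required by this benchmark):
+
+  python all_reduce_benchmark.py --num_gpus=2 --model=resnet50 \
+      --variable_update=replicated --iters_per_step=5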
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import os +import time + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf + +from tensorflow.python.ops import control_flow_ops +import benchmark_cnn +import cnn_util +import flags +from cnn_util import log_fn + + +absl_flags.DEFINE_integer('iters_per_step', 5, + 'Number of iterations to run all-reduce for, per ' + 'step. Every step, a session will be run on a Graph ' + 'that contains this many copies of the all-reduce. ' + 'The copies are run sequentially. Setting this above ' + '1 is useful to lower the overhead of starting the ' + 'session run, running the VariableV2 ops at the ' + 'start of the step, etc.') + + +flags.define_flags() +for name in flags.param_specs.keys(): + absl_flags.declare_key_flag(name) + + +def get_var_shapes(model): + """Returns the list of variable shapes for a tf_cnn_benchmarks Model.""" + with tf.Graph().as_default(): + # The variable shapes do not depend on the batch size. + images = tf.placeholder(tf.float32, model.get_input_shapes('train')[0]) + model.build_network([images]) + return [[int(d) for d in v.shape.dims] for v in tf.trainable_variables()] + + +def all_reduce(all_device_tensors, variable_mgr): + """Performs a single batch all-reduce. + + Args: + all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is + a tensor, where t is the tower the tensor is on and i is the index of + the tensor. + variable_mgr: The VariableMgr to perform the all-reduce. + Returns: + List of list of tensors in the same form as `all_device_tensors`, except the + tensors are aggregated across towers. + """ + tower_grads = [[(g, None) for g in device_tensors] for + device_tensors in all_device_tensors] + _, aggregated_tower_grads = variable_mgr.preprocess_device_grads(tower_grads) + return [ + [g for g, _ in agg_device_tensors] + for agg_device_tensors in aggregated_tower_grads] + + +def build_all_reduce_iterations(all_device_tensors, tower_devices, variable_mgr, + num_iters): + """Builds the all-reduce ops for multiple iterations to aggregate tensors. + + The tensors in `all_device_tensors` are aggregated `num_iters` times. Each + iteration aggregates the results from the previous iteration. The iterations + are run sequentially, so the aggregations for an iteration do not start + running until the previous iteration has completed. Each iteration after the + first is aggregating already-aggregated values, but it does not matter because + we are only aggregating for benchmarking purposes. + + Args: + all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is + a tensor, where t is the tower the tensor is on and i is the index of + the tensor. + tower_devices: A list of device strings. tower_devices[t] is the device + of the tensors in all_device_tensors[t]. + variable_mgr: The VariableMgr to perform the all-reduce. + num_iters: Number of iterations to aggregate tensors for. + Returns: + An op that when run, causes the all-reduce ops to run. + """ + for i in range(num_iters): + with tf.name_scope('iteration_%d' % i): + # Step 1: Do the aggregation. + with tf.name_scope('tensor_aggregation'): + all_device_tensors = all_reduce(all_device_tensors, variable_mgr) + + # Step 2. Create identity ops, to bring the aggregated results back to + # each device. 
+ new_all_device_tensors = [] + for device, device_tensors in zip(tower_devices, all_device_tensors): + with tf.device(device): + new_all_device_tensors.append([ + tf.identity(t, name='identity_after_allreduce') + for t in device_tensors + ]) + all_device_tensors = new_all_device_tensors + + # Step 3. Add control dependencies to delay the next iteration until this + # iteration is complete. To avoid extra overhead, we do not have any + # cross-device control dependencies, which means it's possible for two + # iterations to slightly overlap. + new_all_device_tensors = [] + for device_tensors in all_device_tensors: + new_all_device_tensors.append([ + control_flow_ops.with_dependencies( + device_tensors, t, name='identity_after_dependencies') + for t in device_tensors + ]) + all_device_tensors = new_all_device_tensors + + # To prevent the dependency optimizer from removing every op we created, + # we store the results in variables. + ops_to_run = [] + for device, device_tensors in zip(tower_devices, all_device_tensors): + with tf.device(device): + for t in device_tensors: + # The placeholder initial value is never run. + var = tf.Variable(tf.placeholder(tf.float32, t.shape), collections=[]) + ops_to_run.append(var.assign(t)) + return tf.group(*ops_to_run) + + +def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters): + """Builds the graph for the benchmark. + + Args: + tower_devices: A list of device strings of the devices to run the all-reduce + benchmark on. + tensor_shapes: A list of shapes of the tensors that will be aggregated for + the all-reduce. + variable_mgr: The VariableMgr to perform the all-reduce. + num_iters: Number of iterations to aggregate tensors for. + Returns: + An op that runs the benchmark. + """ + all_device_tensors = [] + for i, tower_device in enumerate(tower_devices): + with tf.device(tower_device): + device_tensors = [] + for j, shape in enumerate(tensor_shapes): + tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32), + name='tensor_%d_on_device_%d' % (j, i)) + device_tensors.append(tensor) + all_device_tensors.append(device_tensors) + + log_fn('Building all-reduce ops') + benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices, + variable_mgr, num_iters) + log_fn('Done building all-reduce ops') + return benchmark_op + + +def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op): + """Runs the graph for the benchmark. + + Args: + benchmark_op: An op that runs the benchmark. + bench_cnn: The BenchmarkCNN where params and other attributes are obtained. + init_ops: A list of ops that are run before `benchmark_op` for + initialization. + dummy_loss_op: Any op. We must pass a loss op to + `benchmark_cnn.benchmark_one_step`, but the result of the op is never + actually used. + """ + config = benchmark_cnn.create_config_proto(bench_cnn.params) + with tf.Session(config=config) as sess: + for op in init_ops: + sess.run(op) + step_train_times = [] + fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op} + log_fn('Running warmup') + for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches): + if i == 0: + log_fn('Running all-reduce ops') + start = time.time() + if i > 0 and i % bench_cnn.params.display_every == 0: + log_fn('Iteration: %d. Average time per step so far: %s' % + (i, (time.time() - start) / i)) + # Call benchmark_one_step instead of directly calling sess.run(...), to + # potentially get a trace file, partitioned graphs, etc. 
+ benchmark_cnn.benchmark_one_step( + sess=sess, + fetches=fetches, + step=i, + # The batch size is only used for the images/sec calculation, which is + # not actually calculated because we pass show_images_per_sec=False. + batch_size=None, + step_train_times=step_train_times, + trace_filename=bench_cnn.trace_filename, + partitioned_graph_file_prefix=( + bench_cnn.params.partitioned_graph_file_prefix), + profiler=None, + image_producer=None, + params=bench_cnn.params, + show_images_per_sec=False) + log_fn('Average time per step: %s' % + ((time.time() - start) / bench_cnn.num_batches)) + + +def run_benchmark(bench_cnn, num_iters): + """Runs the all-reduce benchmark. + + Args: + bench_cnn: The BenchmarkCNN where params, the variable manager, and other + attributes are obtained. + num_iters: Number of iterations to do all-reduce for for. + + Raises: + ValueError: Invalid params of bench_cnn. + """ + if bench_cnn.params.variable_update != 'replicated': + raise ValueError('--variable_update=replicated must be specified to use' + 'the all-reduce benchmark') + if bench_cnn.params.variable_consistency == 'relaxed': + raise ValueError('--variable_consistency=relaxed is not supported') + + benchmark_op = build_graph(bench_cnn.raw_devices, + get_var_shapes(bench_cnn.model), + bench_cnn.variable_mgr, num_iters) + init_ops = [ + tf.global_variables_initializer(), + bench_cnn.variable_mgr.get_post_init_ops() + ] + loss_op = tf.no_op() + + if bench_cnn.graph_file: + path, filename = os.path.split(bench_cnn.graph_file) + as_text = filename.endswith('txt') + log_fn('Writing GraphDef as %s to %s' % ( + 'text' if as_text else 'binary', bench_cnn.graph_file)) + tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True), + path, filename, as_text) + + run_graph(benchmark_op, bench_cnn, init_ops, loss_op) + + +# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks +def main(positional_arguments): + # Command-line arguments like '--distortions False' are equivalent to + # '--distortions=True False', where False is a positional argument. To prevent + # this from silently running with distortions, we do not allow positional + # arguments. + assert len(positional_arguments) >= 1 + if len(positional_arguments) > 1: + raise ValueError('Received unknown positional arguments: %s' + % positional_arguments[1:]) + + params = benchmark_cnn.make_params_from_flags() + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + + tfversion = cnn_util.tensorflow_version_tuple() + log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) + + run_benchmark(bench, absl_flags.FLAGS.iters_per_step) + +if __name__ == '__main__': + tf.disable_v2_behavior() + app.run(main) # Raises error on invalid flags, unlike tf.app.run() diff --git a/cv/classification/resnet50/tensorflow/all_reduce_benchmark_test.py b/cv/classification/resnet50/tensorflow/all_reduce_benchmark_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c8efd53f421049e697a4eeea7486a758c5a52a6c --- /dev/null +++ b/cv/classification/resnet50/tensorflow/all_reduce_benchmark_test.py @@ -0,0 +1,52 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for all_reduce_benchmark.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +import all_reduce_benchmark +import benchmark_cnn +import test_util + + +class AllReduceBenchmarkTest(tf.test.TestCase): + """Tests the all-reduce benchmark.""" + + def _test_run_benchmark(self, params): + """Tests that run_benchmark() runs successfully with the params.""" + logs = [] + with test_util.monkey_patch(all_reduce_benchmark, + log_fn=test_util.print_and_add_to_list(logs)): + bench_cnn = benchmark_cnn.BenchmarkCNN(params) + all_reduce_benchmark.run_benchmark(bench_cnn, num_iters=5) + self.assertRegex(logs[-1], '^Average time per step: [0-9.]+$') + + def test_run_benchmark(self): + """Tests that run_benchmark() runs successfully.""" + params = benchmark_cnn.make_params(num_batches=10, + variable_update='replicated', + num_gpus=2) + self._test_run_benchmark(params) + params = params._replace(hierarchical_copy=True, gradient_repacking=8, + num_gpus=8) + self._test_run_benchmark(params) + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/allreduce.py b/cv/classification/resnet50/tensorflow/allreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..fa51f843444b543622ec01c3322a282ea0fc5139 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/allreduce.py @@ -0,0 +1,648 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for allreduce.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as pycoll +import re + +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +try: + from tensorflow.python.distribute.v1 import all_reduce +except: + from tensorflow.python.distribute import all_reduce +from tensorflow.python.framework import device as pydev +from tensorflow.python.framework import ops +from tensorflow.python.ops import collective_ops + +AllReduceSpecTuple = pycoll.namedtuple('AllReduceSpecTuple', 'alg shards limit') + + +def parse_general_int(s): + """Parse integer with power-of-2 suffix eg. 
32k.""" + mo = re.match(r'(\d+)([KkMGT]?)$', s) + if mo: + i, suffix = mo.group(1, 2) + v = int(i) + if suffix: + if suffix == 'K' or suffix == 'k': + v *= 1024 + elif suffix == 'M': + v *= (1024 * 1024) + elif suffix == 'G': + v *= (1024 * 1024 * 1024) + elif suffix == 'T': + v *= (1024 * 1024 * 1024 * 1024) + else: + raise ValueError('invalid integer string %s' % s) + return v + else: + v = int(s) + return v + + +def parse_all_reduce_spec(all_reduce_spec): + """Parse all_reduce_spec. + + Args: + all_reduce_spec: a string specifying a combination of all-reduce + algorithms to apply for gradient reduction. + + Returns: + a list of AllReduceSpecTuple. + + Raises: + ValueError: all_reduce_spec is not well-formed. + + An all_reduce_spec has BNF form: + int ::= positive whole number + g_int ::= int[KkMGT]? + alg_spec ::= alg | alg#int + range_spec ::= alg_spec | alg_spec/alg_spec + spec ::= range_spec | range_spec:g_int:range_spec + + Not all syntactically correct specifications are supported. + Examples of supported all_reduce_spec strings, with semantics explained: + + 'collective' == apply tf.collective_reduce operator to all tensors. + 'collective#2' == apply tf.collective_reduce operator to all tensors, + requesting up to 2 simultaneous transfers at each node, if + feasible, by subdividing tensor by an additional factor of 2. + 'xring' == apply ring all-reduce to all tensors + 'xring#2' == apply ring all-reduce to all tensors, using two simultaneous + transfer rings, each operating on 1/2 of each tensor. + 'nccl' == apply NCCL all-reduce to all tensors (only works within + a single worker process where all devices are GPUs) + 'nccl/xring' == apply NCCL all-reduce to all tensors within each worker + to produce at least one full-reduced (locally) value, + then apply ring all-reduce to one such value from each + worker, then apply NCCL broadcast to propagate those globally + reduced values back to every device within each worker. + 'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each + distributed tensor is reduced by copying all instances to + one of the worker CPUs, computing the reduction there, then + copying back to each participating device. Tensor reductions + are assigned to specific CPUs round-robin. + 'psgpu#4' == Arrange all GPUs across all workers into groups of 4. + Each distributed tensor is shuffle reduced against one + such group of 4 GPUs, selected round-robin. That is, each + tensor is split across 4 shards for the reduction. + 'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to + tensors of size <= 2048 elements, apply 2-shard pscpu to + tensors up to size 64k elements, apply xring to larger tensors. + 'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on + the worker's CPU, then use 2-shard shuffle to reduce those + locally reduced tensors across workers (on the worker CPUs), then + scatter the globally reduced values locally from each worker CPU. 
+ """ + range_parts = all_reduce_spec.split(':') + ['-1'] + if len(range_parts) % 2: + raise ValueError('all_reduce_spec not well formed: %s' % all_reduce_spec) + limit = 0 + spec = [] + alg = None + shards = 1 + for i, range_part in enumerate(range_parts): + if i % 2 == 1: + try: + limit = parse_general_int(range_part) + spec.append(AllReduceSpecTuple(alg=alg, shards=shards, limit=limit)) + except ValueError: + raise ValueError('all_reduce_spec (%s) contains non-integer range %s' % + (all_reduce_spec, range_part)) + else: + alg = range_part + alg_parts = range_part.split('#') + alg = alg_parts[0] + if len(alg_parts) > 1: + try: + shards = int(alg_parts[1]) + except ValueError: + raise ValueError('all_reduce_spec (%s) contains non-integer ' + 'shards %s' % all_reduce_spec, alg_parts[1]) + else: + shards = 1 + if alg not in [ + 'nccl', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu', 'xring', 'pscpu', + 'psgpu', 'pscpu/pscpu', 'collective' + ]: + raise ValueError('all_reduce_spec (%s) contains invalid alg %s' % + (all_reduce_spec, alg)) + return spec + + +def build_all_reduce_device_prefixes(job_name, num_tasks): + """Build list of device prefix names for all_reduce. + + Args: + job_name: 'worker', 'ps' or 'localhost'. + num_tasks: number of jobs across which device names should be generated. + + Returns: + A list of device name prefix strings. Each element spells out the full + host name without adding the device. + e.g. '/job:worker/task:0' + """ + if job_name != 'localhost': + return ['/job:%s/task:%d' % (job_name, d) for d in range(0, num_tasks)] + else: + assert num_tasks == 1 + return ['/job:%s' % job_name] + + +def group_device_names(devices, group_size): + """Group device names into groups of group_size. + + Args: + devices: list of strings naming devices. + group_size: int >= 1 + + Returns: + list of lists of devices, where each inner list is group_size long, + and each device appears at least once in an inner list. If + len(devices) % group_size = 0 then each device will appear + exactly once. + + Raises: + ValueError: group_size > len(devices) + """ + num_devices = len(devices) + if group_size > num_devices: + raise ValueError('only %d devices, but group_size=%d' % (num_devices, + group_size)) + num_groups = ( + num_devices // group_size + (1 if (num_devices % group_size != 0) else 0)) + groups = [[] for i in range(num_groups)] + for i in range(0, num_groups * group_size): + groups[i % num_groups].append(devices[i % num_devices]) + return groups + + +def split_grads_by_size(threshold_size, device_grads): + """Break gradients into two sets according to tensor size. + + Args: + threshold_size: int size cutoff for small vs large tensor. + device_grads: List of lists of (gradient, variable) tuples. The outer + list is over devices. The inner list is over individual gradients. + + Returns: + small_grads: Subset of device_grads where shape is <= theshold_size + elements. + large_grads: Subset of device_grads where shape is > threshold_size + elements. 
+ """ + small_grads = [] + large_grads = [] + for dl in device_grads: + small_dl = [] + large_dl = [] + for (g, v) in dl: + tensor_size = g.get_shape().num_elements() + if tensor_size <= threshold_size: + small_dl.append([g, v]) + else: + large_dl.append([g, v]) + if small_dl: + small_grads.append(small_dl) + if large_dl: + large_grads.append(large_dl) + return small_grads, large_grads + + +_instance_key = 1 + + +def new_collective_instance_key(): + """Returns a new instance key for use in defining a collective op.""" + global _instance_key + v = _instance_key + _instance_key += 1 + return v + + +_group_key = 1 +_group_key_table = dict() + + +def collective_group_key(devices): + """Returns a group key for the set of devices. + + Args: + devices: list of strings naming devices in a collective group. + + Returns: + int key uniquely identifying the set of device names. + """ + global _group_key + global _group_key_table + parsed = [pydev.DeviceSpec.from_string(d) for d in devices] + names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed]) + concat = ','.join(names) + if concat not in _group_key_table.keys(): + new_key = _group_key + _group_key += 1 + _group_key_table[concat] = new_key + rv = _group_key_table[concat] + return rv + + +def build_collective_reduce(input_tensors, num_workers, num_shards, + red_op='Add', un_op='Id'): + """Build a subgraph that does one full all-reduce, using the collective Op. + + Args: + input_tensors: tensors within a single worker graph that are to be reduced + together; must be one per device. + num_workers: total number of workers with identical independent graphs that + will be doing this same reduction. The reduction will actually include + the corresponding tensors at all these workers. + num_shards: number of shards into which to divide each per-tick chunk, + normally 1 but could be higher on multi-data-path architectures. + red_op: string naming the reduction op + un_op: string naming the unary final op + + Returns: + An array of final tensors, one per device, computed by the full reduction. + + Raises: + ValueError: There must be at least two tensors over all the workers. 
+ """ + group_size = len(input_tensors) * num_workers + if group_size < 2: + raise ValueError('num_workers * len(input_tensors) must be 2 or greater') + devices = [t.device for t in input_tensors] + num_devices = len(devices) + group_key = collective_group_key(devices) + instance_key = new_collective_instance_key() + out_tensors = [] + if num_shards == 1: + subdiv_offsets = [0] + elif num_shards == 2: + if num_devices > 1: + subdiv_offsets = [0, -(num_devices // 2)] + else: + subdiv_offsets = [0] + else: + raise ValueError('Unsupported num_shards %d' % num_shards) + for d in range(num_devices): + with ops.device(devices[d]): + reduce_op = collective_ops.all_reduce(input_tensors[d], + group_size, group_key, instance_key, + red_op, un_op, + subdiv_offsets) + out_tensors.append(reduce_op) + return out_tensors + + +def broadcast_send(t, shape, dtype, group_size, group_key, instance_key): + return collective_ops.broadcast_send(t, shape, dtype, group_size, group_key, + instance_key) + + +def broadcast_recv(shape, dtype, group_size, group_key, instance_key): + return collective_ops.broadcast_recv(shape, dtype, group_size, group_key, + instance_key) + + +def sum_grad_and_var_all_reduce(single_session, + grad_and_vars, + num_workers, + alg, + gpu_indices, + aux_devices=None, + num_shards=1): + """Apply all-reduce algorithm over specified gradient tensors.""" + scaled_grads = [g for g, _ in grad_and_vars] + if alg == 'collective': + assert not single_session + summed_grads = build_collective_reduce( + scaled_grads, num_workers, num_shards, 'Add', 'Id') + else: + with tf.name_scope('allreduce'): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + if alg == 'nccl': + summed_grads = all_reduce.build_nccl_all_reduce(scaled_grads, tf.add) + elif alg == 'xring': + summed_grads = all_reduce.build_ring_all_reduce( + scaled_grads, num_workers, num_shards, gpu_indices, tf.add) + elif alg == 'nccl/xring': + summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards, + tf.add) + elif alg == 'nccl/rechd': + summed_grads = all_reduce.build_nccl_then_recursive_hd( + scaled_grads, tf.add) + elif alg == 'nccl/pscpu': + summed_grads = all_reduce.build_nccl_then_shuffle( + scaled_grads, aux_devices, tf.add, tf.add_n) + elif alg == 'pscpu/pscpu': + summed_grads = all_reduce.build_shuffle_then_shuffle( + scaled_grads, + aux_devices, + # TODO(tucker): devise a way of better specifying the device set + # for the second level. + [aux_devices[0]], + tf.add_n) + elif alg in ['pscpu', 'psgpu']: + summed_grads = all_reduce.build_shuffle_all_reduce( + scaled_grads, aux_devices, tf.add_n) + else: + raise ValueError('unsupported all_reduce alg: ', alg) + + result = [] + for (_, v), g in zip(grad_and_vars, summed_grads): + result.append([g, v]) + return result + + +def contains_any(haystack, needles): + """Tests if any needle is a substring of haystack. + + Args: + haystack: a string + needles: list of strings + + Returns: + True if any element of needles is a substring of haystack, + False otherwise. + """ + for n in needles: + if n in haystack: + return True + return False + + +def sum_gradients_all_reduce(single_session, + dev_prefixes, + tower_grads, + num_workers, + alg, + num_shards, + gpu_indices, + agg_small_grads_max_bytes=0, + agg_small_grads_max_group=10, + allreduce_merge_scope=1): + """Apply all-reduce algorithm over specified gradient tensors. 
+ + Args: + single_session: true if reduction is applied to one graph across + all workers, false if ths application is to a single-worker graph only. + dev_prefixes: list of prefix strings to use to generate PS device names. + tower_grads: the gradients to reduce. + num_workers: number of worker processes across entire job. + alg: the all-reduce algorithm to apply. + num_shards: alg-specific sharding factor. + gpu_indices: indices of local GPUs in order usable for ring-reduce. + agg_small_grads_max_bytes: largest tensor eligible for aggregation, + in number of bytes. + agg_small_grads_max_group: largest permitted aggregation of small + tensors. + allreduce_merge_scope: size of groups into which to partition consecutive + gradients grouped under a common 'allreduce' name scope for application + of ScopedAllocator optimization. + + Returns: + list of reduced tensors + """ + alg_contains_shuffle = contains_any(alg, ['pscpu', 'psgpu']) + is_hierarchical = '/' in alg + if 'pscpu' in alg: + aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes] + elif 'psgpu' in alg: + aux_devices = [ + prefix + '/gpu:%d' % i + for i in range(len(gpu_indices)) + for prefix in dev_prefixes + ] + else: + aux_devices = ['/job:localhost/cpu:0'] + aux_device_groups = group_device_names( + aux_devices, + num_shards if (alg != 'collective' and alg_contains_shuffle) else 1) + group_index = 0 + if agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0: + tower_grads, packing = pack_small_tensors( + tower_grads, + max_bytes=agg_small_grads_max_bytes, + max_group=agg_small_grads_max_group) + else: + packing = None + reduced_gv_list = [] + gv = list(zip(*tower_grads)) + merge_scope = allreduce_merge_scope if allreduce_merge_scope > 0 else 1 + chunked_gv = [gv[x:x + merge_scope] + for x in xrange(0, len(gv), merge_scope)] + for chunk in chunked_gv: + with tf.name_scope('allreduce'): + for grad_and_vars in chunk: + reduced_gv_list.append(sum_grad_and_var_all_reduce( + single_session, + grad_and_vars, num_workers, alg, gpu_indices, + (aux_devices if is_hierarchical + else aux_device_groups[group_index]), + num_shards)) + group_index = (group_index + 1) % len(aux_device_groups) + new_tower_grads = [list(x) for x in zip(*reduced_gv_list)] + if packing: + new_tower_grads = unpack_small_tensors(new_tower_grads, packing) + return new_tower_grads + + +def extract_ranges(index_list, range_size_limit=32): + """Extract consecutive ranges and singles from index_list. + + Args: + index_list: List of monotone increasing non-negative integers. + range_size_limit: Largest size range to return. If a larger + consecutive range exists it will be returned as multiple + ranges. + + Returns: + ranges, singles where ranges is a list of [first, last] pairs of + consecutive elements in index_list, and singles is all of the + other elements, in original order. + """ + if not index_list: + return [], [] + first = index_list[0] + last = first + ranges = [] + singles = [] + for i in index_list[1:]: + if i == last + 1 and (last - first) <= range_size_limit: + last = i + else: + if last > first: + ranges.append([first, last]) + else: + singles.append(first) + first = i + last = i + if last > first: + ranges.append([first, last]) + else: + singles.append(first) + return ranges, singles + + +GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes') + + +def pack_range(key, packing, grad_vars, rng): + """Form the concatenation of a specified range of gradient tensors. 
+ + Args: + key: Value under which to store meta-data in packing that will be used + later to restore the grad_var list structure. + packing: Dict holding data describing packed ranges of small tensors. + grad_vars: List of (grad, var) pairs for one tower. + rng: A pair of integers giving the first, last indices of a consecutive + range of tensors to be packed. + + Returns: + A tensor that is the concatenation of all the specified small tensors. + """ + to_pack = grad_vars[rng[0]:rng[1] + 1] + members = [] + variables = [] + restore_shapes = [] + with tf.name_scope('pack'): + for g, v in to_pack: + variables.append(v) + restore_shapes.append(g.shape) + with tf.device(g.device): + members.append(tf.reshape(g, [-1])) + packing[key] = GradPackTuple( + indices=range(rng[0], rng[1] + 1), + vars=variables, + shapes=restore_shapes) + with tf.device(members[0].device): + return tf.concat(members, 0) + + +def unpack_grad_tuple(gv, gpt): + """Unpack a previously packed collection of gradient tensors. + + Args: + gv: A (grad, var) pair to be unpacked. + gpt: A GradPackTuple describing the packing operation that produced gv. + + Returns: + A list of (grad, var) pairs corresponding to the values that were + originally packed into gv, maybe following subsequent operations like + reduction. + """ + elt_widths = [x.num_elements() for x in gpt.shapes] + with tf.device(gv[0][0].device): + with tf.name_scope('unpack'): + splits = tf.split(gv[0], elt_widths) + unpacked_gv = [] + for idx, s in enumerate(splits): + unpacked_gv.append((tf.reshape(s, gpt.shapes[idx]), gpt.vars[idx])) + return unpacked_gv + + +def pack_small_tensors(tower_grads, max_bytes=0, max_group=0): + """Concatenate small gradient tensors together for reduction. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. + max_bytes: Int giving max number of bytes in a tensor that + may be considered small. + max_group: Int giving max number of small tensors that may be + concatenated into one new tensor. + + Returns: + new_tower_grads, packing where new_tower_grads is identical to + tower_grads except that all feasible small_tensors have been removed + from their places and concatenated into larger tensors that are + now in the front of the list for each tower, and packing contains + the data necessary to restore the tower_grads structure. + + Look through the first tower for gradients of the same type (float), + and small size, that are all sequential. For each such group, + replace by a new tensor that is a flattened concatenation. Note + that the corresponding variable will be absent, which doesn't matter + because it isn't used during all-reduce. + + Requires: + Every gv_list in towers must have isomorphic structure including identical + tensor sizes and types. 
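+
+  For example, with max_bytes=16 (four float32 elements) and per-tower
+  gradients of shapes [4], [4], [3, 3], [3, 3], the two [4] gradients are
+  concatenated into a single [8] tensor at the front of each tower's list,
+  while the [3, 3] gradients follow it in their original order.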
+ """ + small_indices = [] + large_indices = [] + for idx, (g, _) in enumerate(tower_grads[0]): + if g.dtype == tf.float32 and (4 * g.shape.num_elements()) <= max_bytes: + small_indices.append(idx) + else: + large_indices.append(idx) + small_ranges, small_singles = extract_ranges( + small_indices, range_size_limit=max_group) + large_indices = sorted(large_indices + small_singles) + num_gv = len(tower_grads[0]) + packing = {} + if small_ranges: + new_tower_grads = [] + for dev_idx, gv_list in enumerate(tower_grads): + assert len(gv_list) == num_gv + new_gv_list = [] + for r in small_ranges: + key = '%d:%d' % (dev_idx, len(new_gv_list)) + new_gv_list.append((pack_range(key, packing, gv_list, r), + 'packing_var_placeholder')) + for i in large_indices: + new_gv_list.append(gv_list[i]) + new_tower_grads.append(new_gv_list) + return new_tower_grads, packing + else: + return tower_grads, None + + +def unpack_small_tensors(tower_grads, packing): + """Undo the structure alterations to tower_grads done by pack_small_tensors. + + Args: + tower_grads: List of List of (grad, var) tuples. + packing: A dict generated by pack_small_tensors describing the changes + it made to tower_grads. + + Returns: + new_tower_grads: identical to tower_grads except that concatentations + of small tensors have been split apart and returned to their original + positions, paired with their original variables. + """ + if not packing: + return tower_grads + new_tower_grads = [] + num_devices = len(tower_grads) + num_packed = len(packing.keys()) // num_devices + for dev_idx, gv_list in enumerate(tower_grads): + new_gv_list = gv_list[num_packed:] + for i in xrange(0, num_packed): + k = '%d:%d' % (dev_idx, i) + gpt = packing[k] + gv = unpack_grad_tuple(gv_list[i], gpt) + for gi, idx in enumerate(gpt.indices): + assert idx == gpt.indices[gi] + new_gv_list.insert(idx, gv[gi]) + new_tower_grads.append(new_gv_list) + return new_tower_grads diff --git a/cv/classification/resnet50/tensorflow/allreduce_test.py b/cv/classification/resnet50/tensorflow/allreduce_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a372d7ebfbaa4d4d42921549be67d7d7683837a3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/allreduce_test.py @@ -0,0 +1,448 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for tf_cnn_benchmark.allreduce.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as pycoll + +import numpy as np +import tensorflow.compat.v1 as tf +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import variables +import allreduce + + +class AllReduceTest(tf.test.TestCase): + + def testGroupKey(self): + d0 = ['/job:worker/replica:0/task:0/device:GPU:1', + '/job:worker/replica:0/task:0/device:GPU:0', + '/job:worker/replica:0/task:0/device:GPU:3',] + d1 = ['/job:worker/replica:0/task:1/device:GPU:1', + '/job:worker/replica:0/task:1/device:GPU:0', + '/job:worker/replica:0/task:1/device:GPU:3',] + d2 = ['/job:worker/replica:0/task:1/device:GPU:1', + '/job:worker/replica:0/task:1/device:GPU:3', + '/job:worker/replica:0/task:1/device:GPU:0',] + d3 = ['/job:worker/replica:0/task:1/device:GPU:1', + '/job:worker/replica:0/task:1/device:GPU:3', + '/job:worker/replica:0/task:1/device:GPU:2',] + d4 = ['/job:worker/task:0/device:GPU:1', + '/job:worker/task:0/device:GPU:2', + '/job:worker/task:0/device:GPU:3',] + d5 = ['/job:worker/task:0/device:CPU:1', + '/job:worker/task:0/device:CPU:2'] + d6 = ['/job:worker/task:0/device:CPU:2', + '/job:worker/task:0/device:CPU:1'] + g0 = allreduce.collective_group_key(d0) + g1 = allreduce.collective_group_key(d1) + g2 = allreduce.collective_group_key(d2) + g3 = allreduce.collective_group_key(d3) + g4 = allreduce.collective_group_key(d4) + g5 = allreduce.collective_group_key(d5) + g6 = allreduce.collective_group_key(d6) + self.assertEqual(g0, g1) + self.assertEqual(g0, g2) + self.assertTrue(g0 != g3) + self.assertEqual(g3, g4) + self.assertEqual(g5, g6) + self.assertTrue(g4 != g5) + + def testExtractRanges(self): + x = [] + expected_ranges = [] + expected_singles = [] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 3, 4, 6, 7, 8, 9] + expected_ranges = [[3, 4], [6, 9]] + expected_singles = [1] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 2, 3, 4, 6, 7, 8, 9] + expected_ranges = [[1, 4], [6, 9]] + expected_singles = [] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 3, 4, 6, 7, 9] + expected_ranges = [[3, 4], [6, 7]] + expected_singles = [1, 9] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 3, 6, 9] + expected_ranges = [] + expected_singles = [1, 3, 6, 9] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + + def testPackRange(self): + packing = {} + t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32) + t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32) + + gv = [(t0, 'v0'), (t1, 'v1')] + new_t = allreduce.pack_range('0:0', packing, gv, [0, 1]) + self.assertEqual(1, new_t.shape.ndims) + self.assertEqual(8, new_t.shape.dims[0]) + self.assertEqual( + packing, { + '0:0': + allreduce.GradPackTuple( + indices=range(2), + vars=['v0', 'v1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]) + }) + + t2 = tf.constant([[0, 1, 2], [3, 4, 5], 
[6, 7, 8]], dtype=tf.float32) + t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32) + gv = [(t0, 'v0'), (t1, 'v1'), (t2, 'v2'), (t3, 'v3')] + packing = {} + new_t = allreduce.pack_range('1:0', packing, gv, [0, 3]) + self.assertEqual(1, new_t.shape.ndims) + self.assertEqual(26, new_t.shape.dims[0]) + self.assertEqual( + packing, { + '1:0': + allreduce.GradPackTuple( + indices=range(4), + vars=['v0', 'v1', 'v2', 'v3'], + shapes=[ + tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3]), + tf.TensorShape([3, 3]) + ]) + }) + + def testUnpackGradTuple(self): + packing = { + '0:0': + allreduce.GradPackTuple( + indices=range(4), + vars=['v0', 'v1', 'v2', 'v3'], + shapes=[ + tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3]), + tf.TensorShape([3, 3]) + ]) + } + tc = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32) + packed_gv = [tc, 'packing_var_placeholder'] + gv = allreduce.unpack_grad_tuple(packed_gv, packing['0:0']) + self.assertEqual(4, len(gv)) + self.assertEqual('v0', gv[0][1]) + self.assertEqual('v1', gv[1][1]) + self.assertEqual('v2', gv[2][1]) + self.assertEqual('v3', gv[3][1]) + self.assertEqual(1, gv[0][0].shape.ndims) + self.assertEqual(4, gv[0][0].shape.dims[0]) + self.assertEqual(1, gv[1][0].shape.ndims) + self.assertEqual(4, gv[1][0].shape.dims[0]) + self.assertEqual(2, gv[2][0].shape.ndims) + self.assertEqual(3, gv[2][0].shape.dims[0]) + self.assertEqual(3, gv[2][0].shape.dims[1]) + + def testPackSmallTensors(self): + t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32) + t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32) + t2 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32) + t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32) + tower_grads = [] + for d in range(0, 3): + gv = [(t0, 'v_%d_0' % d), (t1, 'v_%d_1' %d), (t2, 'v_%d_2' %d), + (t3, 'v_%d_3' % d)] + tower_grads.append(gv) + + # 1) Set the size limit so small that nothing gets concatenated. 
+ new_tower_grads, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=12, + max_group=10) + self.assertEqual(tower_grads, new_tower_grads) + self.assertTrue(packing is None) + + # 2) Set the size limit so only the first two tensors get concatenated + new_tower_grads, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=16, # 16 bytes == 4 elements + max_group=10) + self.assertEqual(3, len(new_tower_grads)) + self.assertEqual(4, len(tower_grads[0])) + first_tower = new_tower_grads[0] + self.assertEqual(3, len(first_tower)) + self.assertEqual(1, first_tower[0][0].shape.ndims) + self.assertEqual(8, first_tower[0][0].shape.dims[0]) + self.assertEqual(packing, + {'0:0': allreduce.GradPackTuple( + indices=range(2), + vars=['v_0_0', 'v_0_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '1:0': allreduce.GradPackTuple( + indices=range(2), + vars=['v_1_0', 'v_1_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '2:0': allreduce.GradPackTuple( + indices=range(2), + vars=['v_2_0', 'v_2_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])])}) + + # 3) Set the size limit so all tensors get concatenated + new_tower_grads, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=256, # bytes = 64 elements + max_group=10) + self.assertEqual(3, len(new_tower_grads)) + self.assertEqual(4, len(tower_grads[0])) + self.assertEqual(1, len(new_tower_grads[0])) + first_tower = new_tower_grads[0] + self.assertEqual(1, first_tower[0][0].shape.ndims) + self.assertEqual(26, first_tower[0][0].shape.dims[0]) + self.assertEqual(packing, + {'0:0': allreduce.GradPackTuple( + indices=range(4), + vars=['v_0_0', 'v_0_1', 'v_0_2', 'v_0_3'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])]), + '1:0': allreduce.GradPackTuple( + indices=range(4), + vars=['v_1_0', 'v_1_1', 'v_1_2', 'v_1_3'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])]), + '2:0': allreduce.GradPackTuple( + indices=range(4), + vars=['v_2_0', 'v_2_1', 'v_2_2', 'v_2_3'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])])}) + + def testUnpackSmallTensors(self): + packing = {'0:0': allreduce.GradPackTuple(indices=range(2), + vars=['v_0_0', 'v_0_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '0:1': allreduce.GradPackTuple(indices=range(3, 5), + vars=['v_0_3', 'v_0_4'], + shapes=[tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])]), + '1:0': allreduce.GradPackTuple(indices=range(2), + vars=['v_1_0', 'v_1_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '1:1': allreduce.GradPackTuple(indices=range(3, 5), + vars=['v_1_3', 'v_1_4'], + shapes=[tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])])} + t0 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7], dtype=tf.float32) + t1 = tf.constant([17, 17], dtype=tf.float32) + t2 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, 8, + 0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32) + t3 = tf.constant([0], dtype=tf.float32) + tower_grads = [] + for d in range(0, 2): + one_tower = [(t0, 'packing_var_placeholder'), + (t2, 'packing_var_placeholder'), + (t1, 'v_%d_2' % d), (t3, 'v_%d_5' %d)] + tower_grads.append(one_tower) + new_tower_grads = allreduce.unpack_small_tensors(tower_grads, packing) + self.assertEqual(2, len(new_tower_grads)) + for d, tg in enumerate(new_tower_grads): + self.assertEqual(6, len(tg)) + self.assertEqual('v_%d_0' % d, tg[0][1]) + self.assertEqual('v_%d_1' % d, tg[1][1]) 
+ self.assertEqual('v_%d_2' % d, tg[2][1]) + self.assertEqual('v_%d_3' % d, tg[3][1]) + self.assertEqual('v_%d_4' % d, tg[4][1]) + self.assertEqual('v_%d_5' % d, tg[5][1]) + self.assertEqual(1, tg[0][0].shape.ndims) + self.assertEqual(4, tg[0][0].shape.dims[0]) + self.assertEqual(1, tg[1][0].shape.ndims) + self.assertEqual(4, tg[1][0].shape.dims[0]) + self.assertEqual(1, tg[2][0].shape.ndims) + self.assertEqual(2, tg[2][0].shape.dims[0]) + self.assertEqual(2, tg[3][0].shape.ndims) + self.assertEqual(3, tg[3][0].shape.dims[0]) + self.assertEqual(3, tg[3][0].shape.dims[1]) + self.assertEqual(2, tg[4][0].shape.ndims) + self.assertEqual(3, tg[4][0].shape.dims[0]) + self.assertEqual(3, tg[4][0].shape.dims[1]) + self.assertEqual(1, tg[5][0].shape.ndims) + self.assertEqual(1, tg[5][0].shape.dims[0]) + + +class DynamicPackingTest(test_util.TensorFlowTestCase): + """Packing/Unpacking tests that require executing a TensorFlow session.""" + + def _init_tensors(self, num_towers, tensor_shapes): + """Construct a collection of tensors across multiple devices.""" + num_tensors = len(tensor_shapes) + consts = [] + tensors = [] + vrbls = [] + tower_grads = [] + tf.Variable([-1], dtype=tf.int32, name='packing_var_placeholder') + for dev_idx in range(0, num_towers): + devname = '/job:localhost/device:GPU:%d' % dev_idx + consts.append([]) + tensors.append([]) + vrbls.append([]) + with tf.device(devname): + base_value = 0 + gv_tuples = [] + for t_idx in range(0, num_tensors): + shape = tensor_shapes[t_idx] + num_elts = 0 + for d in shape: + num_elts = (num_elts or 1) * d + c = np.fromiter(range(base_value, base_value + num_elts), + dtype=np.float32).reshape(shape) + base_value += num_elts + consts[dev_idx].append(c) + tensors[dev_idx].append(tf.constant(c)) + vrbls[dev_idx].append( + tf.Variable(c, name='v_d%d_t%d' % (dev_idx, t_idx))) + gv_tuples.append((tensors[dev_idx][-1], vrbls[dev_idx][-1])) + tower_grads.append(gv_tuples) + return tower_grads, consts, tensors, vrbls + + _test_tuple = pycoll.namedtuple('_test_tuple', + 'num_devices, in_shapes out_shapes out_i') + + def _do_pack_unpack_test(self, tt): + """Do a single pack-unpack test. + + Args: + tt: A _test_tuple defining the parameters of the test to do. + + This test executes a graph that performs a pack of tower_grads + followed by an unpack and verifies that the shapes and values + of gradient tensors are unchanged, along with paired variables. 
+ """ + with ops.Graph().as_default(): + tower_grads, consts, _, vrbls = self._init_tensors( + tt.num_devices, tt.in_shapes) + packed_tg, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=40, max_group=10) + unpacked_tg = allreduce.unpack_small_tensors(packed_tg, packing) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + packed = sess.run(packed_tg) + for d in range(0, tt.num_devices): + for t in range(0, len(tt.out_shapes)): + num_elts = 0 + for dim in tt.out_shapes[t]: + num_elts = (num_elts or 1) * dim + self.assertTrue(np.array_equal( + np.array(range(tt.out_i[t], tt.out_i[t] + num_elts), + dtype=np.float32).reshape(tt.out_shapes[t]), + packed[d][t][0])) + unpacked = sess.run(unpacked_tg) + for d in range(0, tt.num_devices): + for t in range(0, len(tt.in_shapes)): + self.assertTrue(np.array_equal(consts[d][t], unpacked[d][t][0])) + self.assertEqual(vrbls[d][t], unpacked_tg[d][t][1]) + + def testPackUnpack0(self): + self._do_pack_unpack_test( + self._test_tuple(num_devices=3, + in_shapes=[[8], [3, 3], [12], [5, 5, 5]], + out_shapes=[[17], [12], [5, 5, 5]], + out_i=[0, 17, 29])) + + def testPackUnpack1(self): + self._do_pack_unpack_test( + self._test_tuple(num_devices=4, + in_shapes=[[5, 5, 5], [2, 3], [5]], + out_shapes=[[11], [5, 5, 5]], + out_i=[125, 0])) + + def testPackUnpack2(self): + self._do_pack_unpack_test( + self._test_tuple(num_devices=2, + in_shapes=[[5, 5, 5], [2, 3], [1, 5], [7], [100]], + out_shapes=[[18], [5, 5, 5], [100]], + out_i=[125, 0, 143])) + + def _do_all_reduce_pack_test(self, tt): + """Test that all-reduce results are the same with or without packing.""" + with ops.Graph().as_default(): + tower_grads, consts, _, _ = self._init_tensors( + tt.num_devices, tt.in_shapes) + dev_prefixes = ['/job:localhost'] + num_workers = 1 + alg = 'xring' + shards = 1 + single_session = True + gpu_indices = range(0, tt.num_devices) + assert len(gpu_indices) == len(tower_grads) + no_pack_all_reduce = allreduce.sum_gradients_all_reduce( + single_session, + dev_prefixes, tower_grads, num_workers, alg, shards, + gpu_indices, + agg_small_grads_max_bytes=0, agg_small_grads_max_group=1) + packed_tg, packing = allreduce.pack_small_tensors(tower_grads, 100, 100) + packed_all_reduce = allreduce.sum_gradients_all_reduce( + single_session, + dev_prefixes, packed_tg, num_workers, alg, shards, + gpu_indices, + agg_small_grads_max_bytes=0, agg_small_grads_max_group=1) + unpacked_tg = allreduce.unpack_small_tensors(packed_all_reduce, packing) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + no_pack_values = sess.run(no_pack_all_reduce) + pack_unpack_values = sess.run(unpacked_tg) + for d in range(1, tt.num_devices): + for t in range(0, len(tt.in_shapes)): + self.assertTrue(np.allclose(no_pack_values[d][t][0], + tt.num_devices * consts[0][t])) + self.assertTrue(np.array_equal(no_pack_values[d][t][0], + pack_unpack_values[d][t][0])) + + def testAllReducePacked0(self): + self._do_all_reduce_pack_test( + self._test_tuple(num_devices=3, + in_shapes=[[8], [3, 3], [12], [5, 5, 5]], + out_shapes=[[17], [12], [5, 5, 5]], + out_i=[0, 17, 29])) + + def testAllReducePacked1(self): + self._do_all_reduce_pack_test( + self._test_tuple(num_devices=2, + in_shapes=[[8], [3, 3], [12], [5, 5, 5], [3], [4]], + out_shapes=[[17], [7], [12], [5, 5, 5]], + out_i=[0, 17, 29, 154, 157])) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git 
a/cv/classification/resnet50/tensorflow/batch_allreduce.py b/cv/classification/resnet50/tensorflow/batch_allreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..e36a39ed45b143302724cd7d5b6a9f2d5c952dad --- /dev/null +++ b/cv/classification/resnet50/tensorflow/batch_allreduce.py @@ -0,0 +1,628 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains classes and functions for doing a single-machine batch all-reduce. + +An all-reduce is taking the reduction (typically a sum) of a list of tensors, +each on a different device. The result must end up back on each device, which is +where the word "all" comes from. In summary, each device starts with a single +tensor, and ends up with the reduction of all tensors. + +A batch all-reduce is doing several independent all-reduces. When doing a batch +all-reduce, care is taken to evenly distribute the reduction computations +across devices and inter-device tensor transfers across device links. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# TODO(reedwm): Support distributed all-reduces in this file. +# TODO(reedwm): Merge this code with allreduce.py, which contains some batch +# all-reduce code that this file calls. allreduce.py also supports distributed +# batch-reduce while this file only supports single-machine all-reduce. + +import abc + +import six +import tensorflow.compat.v1 as tf + +from tensorflow.python.ops import data_flow_ops +import allreduce +import constants + + +def _all_reduce_using_copy(tensors_across_devices, use_mean): + """Does an all-reduce of a list of tensors by copying to the current device. + + The tensors are copied to the current device and then reduced. + + Args: + tensors_across_devices: A list of tensors, each on a different device. + use_mean: Whether to take the mean of the tensors instead of a sum: + Returns: + A reduced tensor on the current device. + """ + reduced_tensor = tf.add_n(tensors_across_devices) + if use_mean: + reduced_tensor *= 1 / len(tensors_across_devices) + return reduced_tensor + + +@six.add_metaclass(abc.ABCMeta) +class BatchAllReduceAlgorithm(object): + """Represents an algorithm for performing a batch all-reduce operation.""" + + def batch_all_reduce(self, + all_device_tensors, + num_splits, + compact_tensors, + defer_tensors, + xla_compile=False): + """Performs a batch all-reduce. + + The reduction done is a sum. + + `all_device_tensors` is a list of list of tensors that will be batch + all-reduced. All tensors within a single inner list must be on the same + device. The nth element in each list, for any n, will be reduced together. + The return value is in the same form as `all_device_tensors`, except that + each tensor is reduced. 
+ + For example, if `all_device_tensors` is: + [[ A, B ], # A and B are on GPU 0 + [ C, D ]] # C and D are on GPU 1 + + Then the return value will be: + [[ A+C, B+D ], # These two tensors are on GPU 0 + [ A+C, B+D ]] # These two tensors are on GPU 1 + + Arguments: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` + is a tensor where `i` is the device index and `j` is the tensor index. + num_splits: If not None, tensors will be concatenated and split into this + many pieces during the all-reduce, then split back into their original + shapes afterwards. Has no impact on correctness and can improve + performance. Requires all tensors to be the same type. + compact_tensors: If True, tensors are casted to fp16 before being all- + reduced. Improves performance, but hurts numerical stability. + defer_tensors: If True, every time the return value + `reduced_all_device_tensors` is evaluated, the result will be the + reduced tensors values of `all_device_tensors` from the previous session + run instead of the current session run, or zero on the first session + run. This can improve performance. When training neural networks, + deferring gradients often does not harm training, so this can be used to + improve performance. + xla_compile: If True, use XLA to compile gradients packing and unpacking + ops. + + Returns: + reduced_all_device_tensors: A list in the same form as + `all_device_tensors`, except each tensor has been reduced. + warmup_ops: A list of ops needed to be run once before the all-reduce can + occur. + """ + + # Before all-reducing tensors, we do several preprocessing functions that + # can speed up the all-reduce. We undo these functions after all-reducing + # the tensors. + + # all_device_packed_tensors is a 2-d list of tensors indexed by + # [device_id][tensor_id], holding packed tensors from all devices involved + # in all-reduce. + all_device_packed_tensors = [] + + # all_device_warmup_ops is a 2-d list of ops indexed by + # [device_id][tensor_id], holding warmup_ops that need to be run once before + # all-reduce can occur. + all_device_warmup_ops = [] + + # all_device_put_ops is a 2-d list of ops indexed by + # [device_id][tensor_id], holding put ops for deferred tensors. They will be + # called in each all-reduce step automatically due to control dependency. + all_device_put_ops = [] + + # packers is a list of _TensorPacker, one for each device involved in + # all-reduce. + packers = [ + _TensorPacker(num_splits, compact_tensors) for _ in all_device_tensors + ] + + for packer, device_tensors in zip(packers, all_device_tensors): + + def pack_single_device_tensors(packer=packer, + device_tensors=device_tensors): + """Pack gradient tensors of a device.""" + packed_tensors = packer.maybe_concat_tensors(device_tensors) + packed_tensors = packer.maybe_compact_tensors(packed_tensors) + # When xla_compile=False, defer tensors after concat for better + # performance. + if defer_tensors and not xla_compile: + packed_tensors, put_ops, warmup_ops = defer_single_device_tensors( + packed_tensors) + all_device_put_ops.append(put_ops) + all_device_warmup_ops.append(warmup_ops) + packed_tensors = packer.maybe_split_tensors(packed_tensors) + return packed_tensors + + with tf.device(device_tensors[0].device): + if xla_compile: + packed_tensors = tf.xla.experimental.compile( + pack_single_device_tensors) + # When xla_compile=True, intermediate tensors in packing process are + # not materialized. 
Thus, we defer tensors after packing process is + # completed instead of in the middle of it. + if defer_tensors: + packed_tensors, put_ops, warmup_ops = defer_single_device_tensors( + packed_tensors) + all_device_put_ops.append(put_ops) + all_device_warmup_ops.append(warmup_ops) + else: + packed_tensors = pack_single_device_tensors() + + all_device_packed_tensors.append(packed_tensors) + + # Perform all-reduce on packed tensors. + all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors) + + all_device_unpacked_tensors = [] + for packer, device_tensors in zip(packers, all_device_tensors): + + def unpack_single_device_tensors(packer=packer, + device_tensors=device_tensors): + """Unpack gradient tensors of a device.""" + unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors) + unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors) + unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors) + return unpacked_tensors + + with tf.device(device_tensors[0].device): + if xla_compile: + unpacked_device_tensor = tf.xla.experimental.compile( + unpack_single_device_tensors) + else: + unpacked_device_tensor = unpack_single_device_tensors() + + all_device_unpacked_tensors.append(unpacked_device_tensor) + + # Note: There is no undo operation for deferring tensors. But we do need to + # call _add_put_op_control_deps at the end if we deferred the tensors. + if defer_tensors: + all_device_unpacked_tensors = _add_put_op_control_deps( + all_device_unpacked_tensors, num_splits, all_device_put_ops) + + return all_device_unpacked_tensors, all_device_warmup_ops + + @abc.abstractmethod + def _do_batch_all_reduce(self, all_device_tensors): + """Performs a batch all-reduce. + + Unlike `self.batch_all_reduce`, this does not do any preprocessing of the + tensors. + + Args: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` + is a tensor where `i` is the device index and `j` is the tensor index. + Returns: + reduced_all_device_tensors: A list in the same form as + `all_device_tensors`, except each tensor has been reduced. + """ + pass + + +class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm): + """An algorithm that copies tensors to be reduced to a specific device.""" + + def __init__(self, devices_to_reduce_on, use_mean=False): + self._devices = devices_to_reduce_on + self._use_mean = use_mean + + def _do_batch_all_reduce(self, all_device_tensors): + reduced_tensors = [] + for i, tensors_across_devices in enumerate(zip(*all_device_tensors)): + with tf.device(self._devices[i % len(self._devices)]): + reduced_tensor = _all_reduce_using_copy(tensors_across_devices, + self._use_mean) + reduced_tensors.append(reduced_tensor) + # The tensors will be brought back to each device once they are used. + return [reduced_tensors] * len(all_device_tensors) + + +class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm): + """An algorithm that uses hierarchical copies. This is only optimized for + eight devices connected in NetworkTopology.DGX1 or NetworkTopology.GCP_V100 + topology. + """ + + def __init__(self, network_topology): + """Initializer for HierarchicalCopyAlgorithm. + + Args: + network_topology: An instance of Enum class constants.NetworkTopology. 
+ """ + self._network_topology = network_topology + + def _do_batch_all_reduce(self, all_device_tensors): + avail_devices = [device_tensors[0].device + for device_tensors in all_device_tensors] + reduced_tensors = [] + num_devices = len(avail_devices) + group_size = num_devices // 2 + for i, tensors_across_devices in enumerate(zip(*all_device_tensors)): + group_0_main_device, group_1_main_device = self.__get_main_devices( + i, num_devices) + if group_0_main_device < group_size: + group_0_begin = 0 + group_1_begin = group_size + else: + group_0_begin = group_size + group_1_begin = 0 + + # Reduce the first group. + group_0_tensors = tensors_across_devices[group_0_begin: + group_0_begin + group_size] + with tf.device(avail_devices[group_0_main_device]): + group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False) + + # Reduce the second group. + group_1_tensors = tensors_across_devices[group_1_begin: + group_1_begin + group_size] + with tf.device(avail_devices[group_1_main_device]): + group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False) + + # Reduce between the groups. + with tf.device(avail_devices[group_0_main_device]): + total_reduced_tensor = _all_reduce_using_copy( + [group_0_reduced_tensor, group_1_reduced_tensor], False) + + # Broadcast the result back into the root of each group. + with tf.device(avail_devices[group_0_main_device]): + group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor) + with tf.device(avail_devices[group_1_main_device]): + group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor) + + reduced_tensors_bcast = [] + for j in range(len(tensors_across_devices)): + with tf.device(avail_devices[j]): + # Broadcast the result back to each member in the group from the root. + if (group_0_main_device < group_size) == (j < group_size): + src_device_tensor = group_0_reduced_tensor_bcast + else: + src_device_tensor = group_1_reduced_tensor_bcast + reduced_tensors_bcast.append(tf.identity(src_device_tensor)) + + reduced_tensors.append(reduced_tensors_bcast) + + reduced_tensors = list(zip(*reduced_tensors)) + return reduced_tensors + + def __get_main_devices(self, tensor_index, num_devices): + """Returns the pair of main devices to use for initial reduction. + + Args: + tensor_index: Index of the current tensor in the list of tensors to copy. + num_devices: Total number of devices. + + Returns: + A tuple containing pair of main device indices for the initial + reduction. Then, the first element of the tuple should be used for the + final reduction. + + Raises: + ValueError: Invalid input arguments. + """ + if self._network_topology == constants.NetworkTopology.DGX1: + return tensor_index % num_devices, (tensor_index + + (num_devices // 2)) % num_devices + elif self._network_topology == constants.NetworkTopology.GCP_V100: + if num_devices != 8: + raise ValueError('HierarchicalCopy only supports eight devices in %s.' % + self._network_topology) + # TODO(hinsu): Generalize main device indices to handle any other + # isomorphic connection graph that connects two cliques using connections + # other than 0-5 and 2-7. + main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)] + return main_device_pairs[tensor_index % len(main_device_pairs)] + else: + # TODO(reedwm): make this logic more general for arbitrary topology. + raise ValueError( + 'HierarchicalCopy is not supported for %s network topology.' 
% + self._network_topology) + + +class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm): + """An algorithm that uses an all reduce spec.""" + + def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes, + agg_small_grads_max_group): + spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + if len(spec) != 1: + raise ValueError( + 'Replicated mode does not support hybrid all-reduce strategies') + self._all_reduce_spec = spec[0] + self._gpu_indices = gpu_indices + self._agg_small_grads_max_bytes = agg_small_grads_max_bytes + self._agg_small_grads_max_group = agg_small_grads_max_group + + def _do_batch_all_reduce(self, all_device_tensors): + # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other + # gradient aggregation code, since gradient aggregation is doing an all + # reduce. Currently, we do gradient repacking in two different places. + # TODO(reedwm): Change the allreduce code to reduce tensors instead of + # tower_grads. + tower_grads = [[(t, None) for t in device_tensors] + for device_tensors in all_device_tensors] + aggregated_device_grads = allreduce.sum_gradients_all_reduce( + False, # single_session + ['/job:localhost'], + tower_grads, + 1, + self._all_reduce_spec.alg, + self._all_reduce_spec.shards, + self._gpu_indices, + agg_small_grads_max_bytes=self._agg_small_grads_max_bytes, + agg_small_grads_max_group=self._agg_small_grads_max_group) + return [[t for t, _ in grad_vars] for grad_vars in aggregated_device_grads] + + +def algorithm_from_params(params): + """Returns a BatchAllReduceAlgorithm from a Params tuple.""" + if params.all_reduce_spec: + if params.gpu_indices: + gpu_indices = [int(x) for x in params.gpu_indices.split(',')] + else: + gpu_indices = [x for x in range(params.num_gpus)] + return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices, + params.agg_small_grads_max_bytes, + params.agg_small_grads_max_group) + elif params.hierarchical_copy: + return HierarchicalCopyAlgorithm(params.network_topology) + else: + if params.local_parameter_device == 'gpu': + devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)] + else: + devices_to_reduce_on = ['/cpu:0'] + return CopyToDeviceAlgorithm(devices_to_reduce_on) + + +def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True): + """Applies a function to each tensor in `all_device_tensors`. + + A new list of lists of tensors is returned, where every tensor in + `all_device_tensors` has had `apply_func` called on it. `all_device_tensors` + is not modified. + + Args: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is + a tensor where `i` is the device index and `j` is the tensor index. + apply_func: A function taking in three arguments: tensor, device_index, + tensor_index, and returning a modified tensor. + `tensor` is `all_device_tensors[device_index][tensor_index]`. + colocate: If True, apply_func will be run under context manager colocated + with it's input tensor. + Returns: + A list in the same form as `all_device_tensors`, except each tensor has had + `apply_func` called on it. 
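+
+  Example (illustrative only; assumes `all_device_tensors` has already been
+  built as described above):
+
+    # Add 1.0 to every tensor on every device, ignoring both indices.
+    bumped = _apply_to_all_device_tensors(
+        all_device_tensors,
+        lambda tensor, device_index, tensor_index: tensor + 1.0)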
+ """ + new_all_device_tensors = [] + for device_index, device_tensors in enumerate(all_device_tensors): + new_device_tensors = [] + for tensor_index, t in enumerate(device_tensors): + if colocate: + with tf.colocate_with(t): + new_t = apply_func(t, device_index, tensor_index) + else: + new_t = apply_func(t, device_index, tensor_index) + new_device_tensors.append(new_t) + new_all_device_tensors.append(new_device_tensors) + return new_all_device_tensors + + +def _defer_tensor(tensor): + """Defers the retrieval of a tensor. + + The tensor is put into a StagingArea, and the return value is the + retrieval of the tensor from the StagingArea. The effect is that the + tensor returned from this function is the tensor that was put in the + StagingArea for the previous Session.run() call. + + Args: + tensor: The tensor to defer for one step. + + Returns: + deferred_tensor: The tensor deferred for one step. + put_op: An op to put `tensor` in the StagingArea. Must be run every step + that `deferred_tensor` is run. + warmup_op: A warmup op that should be called before the first step. Puts + a zero tensor into the StagingArea. + """ + tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape]) + put_op = tensor_stage.put([tensor]) + warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)]) + + # Fetch the next tensor to use. + (tensor,) = tensor_stage.get() + return tensor, put_op, warmup_op + + +def defer_single_device_tensors(device_tensors): + """Defer tensors (gradients in this case) from a single device. + + Arguments: + device_tensors: A list of gradients tensors from a single device to defer. + + Returns: + deferred_tensors: A list of tensors deferred for one step. + put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run + every step that `deferred_tensors` is run. + warmup_ops: Warmup ops that should be called before the first step. Puts + zero tensors into the StagingArea. + """ + put_ops = [] + warmup_ops = [] + deferred_tensors = [] + + for tensor in device_tensors: + deferred_tensor, put_op, warmup_op = _defer_tensor(tensor) + deferred_tensors.append(deferred_tensor) + put_ops.append(put_op) + warmup_ops.append(warmup_op) + + return deferred_tensors, put_ops, warmup_ops + + +def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops): + """Add control dependencies from `put_ops` to `all_device_tensors`. + + This should only be called when deferred tensors are being used. + + The control dependencies are added so that the put ops are run whenever + `all_device_tensors` is run. That way, the caller does not have to explicitly + run the put ops. + + Args: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is + a tensor where `i` is the device index and `j` is the tensor index. + num_splits: The number of splits that were used for the all-reduce. + put_ops: A list of put ops from deferring the tensors. + Returns: + A list in the same form as `all_device_tensors`, except each tensor has a + control dependency on an op in `put_ops`. + + """ + def apply_func(tensor, device_index, tensor_index): + if num_splits == 0: + deps = [put_ops[device_index][tensor_index]] + else: + deps = put_ops[device_index] + assert len(deps) == 1 + with tf.control_dependencies(deps): + return tf.identity(tensor, name='control_dependency') + return _apply_to_all_device_tensors(all_device_tensors, apply_func) + + +class _TensorPacker(object): + """Packs and unpacks tensors into groups. 
+ + This class first concatenates a set of tensors, then split the concatenated + tensor into a small number of chunks. This is useful for all-reducing tensors, + as doing a small number of all-reduces on large tensors can be faster than + doing a large number of all-reduces on small tensors. + + It also provides option to compact tensors by casting them to fp16, for better + all-reduce performance. + + This class maintains states of processed tensors like shapes and types. So + each packer can only be used to pack and unpack one list of tensors. If you + need to pack multiple lists of tensors (say from multiple devices), then you + need multiple _TensorPacker object, one for each device. + """ + + def __init__(self, num_splits, compact): + """Initializes the _TensorPacker. + + Arguments: + num_splits: The number of tensors to split the concatenated tensor into. + The batch all-reduce will consist of `num_splits` all-reduces. if None + or zero, tensors are not split or concatenated. + compact: If True, tensors are casted to fp16 during packing and casted + back to their original dtypes during unpacking. + """ + self._num_splits = num_splits + self._compact = compact + self._before_compact_dtypes = [] + + def maybe_concat_tensors(self, device_tensors): + """Concatenate tensors into a single tensor.""" + if not self._num_splits: + return device_tensors + + flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors] + self._orig_shapes = [t.shape for t in device_tensors] + self._orig_sizes = [s.num_elements() for s in self._orig_shapes] + # All shapes must be fully defined. + assert None not in self._orig_sizes + concatenated_grad = tf.concat(flat_tensors, 0) + return [concatenated_grad] + + def maybe_split_tensors(self, concatenated_tensor): + """Split concatenated tensor into `num_splits` pieces.""" + if not self._num_splits: + return concatenated_tensor + + if len(concatenated_tensor) != 1: + raise RuntimeError('tensors must be concatenated via ' + 'maybe_concat_tensors() before splitting') + + concatenated_tensor = concatenated_tensor[0] + total_tensor_size = concatenated_tensor.shape.num_elements() + split_size = total_tensor_size // self._num_splits + split_size_last = total_tensor_size - split_size * (self._num_splits - 1) + split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last] + tensor_packs = tf.split(concatenated_tensor, split_sizes) + return tensor_packs + + def undo_maybe_split_tensors(self, tensor_packs): + """Undo maybe_split_tensors().""" + if not self._num_splits: + return tensor_packs + + return [tf.concat(tensor_packs, 0)] + + def undo_maybe_concat_tensors(self, concatenated_tensor): + """Undo maybe_concat_tensors().""" + if not self._num_splits: + return concatenated_tensor + + if len(concatenated_tensor) != 1: + raise RuntimeError( + 'undo_maybe_split_tensors() must be called before ' + 'undo_maybe_concat_tensors when num_splits is greater than 1') + concatenated_tensor = concatenated_tensor[0] + + tensors_with_sizes = tf.split(concatenated_tensor, + self._orig_sizes) + tensors_with_shapes = [ + tf.reshape(grad, shape) for grad, shape in zip( + tensors_with_sizes, self._orig_shapes) + ] + return tensors_with_shapes + + def maybe_compact_tensors(self, device_tensors): + """Cast tensors to fp16 and store their original types.""" + if not self._compact: + return device_tensors + + if self._before_compact_dtypes: + raise RuntimeError('maybe_compact_tensors can only be called once.') + + self._before_compact_dtypes = [t.dtype for t in device_tensors] + 
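# Cast to float16 so the all-reduce moves fewer bytes; undo_maybe_compact_tensors() uses the dtypes recorded above to cast back. +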
compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors] + + return compact_tensors + + def undo_maybe_compact_tensors(self, compact_tensors): + """Undo maybe_compact_tensors().""" + if not self._compact: + return compact_tensors + + if not self._before_compact_dtypes: + raise RuntimeError('maybe_compact_tensors() must be called before ' + 'undo_maybe_compact_tensors()') + + device_tensors = [ + tf.cast(t, dtype) + for t, dtype in zip(compact_tensors, self._before_compact_dtypes) + ] + return device_tensors diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn.py b/cv/classification/resnet50/tensorflow/benchmark_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..6f65ea69b46f479a649c81aaddc797f30809c1ae --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn.py @@ -0,0 +1,3554 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""TensorFlow benchmark library. + +See the README for more information. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +from collections import namedtuple +import contextlib +import math +import multiprocessing +import os +import re +import threading +import time +import traceback + +from absl import flags as absl_flags +import numpy as np + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import cnn_util +import constants +import datasets +import flags +import mlperf +import variable_mgr +import variable_mgr_util +from cnn_util import log_fn +from models import model_config +from platforms import util as platforms_util +from google.protobuf import text_format +from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python import debug as tf_debug +from tensorflow.python.client import timeline +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import graph_util_impl +from tensorflow.python.framework import importer +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.platform import gfile +from tensorflow.python.util import nest + + +_DEFAULT_NUM_BATCHES = 100 + + +# GraphInfo encapsulates the tensors/ops that we care about after building a +# graph. We use them to benchmark the graph. +GraphInfo = namedtuple( # pylint: disable=invalid-name + 'GraphInfo', + [ + # Ops that produce the input batches (before preprocessing). 
+ 'input_producer_op', + # Ops that adds the preprocessed images to the staging areas + 'enqueue_ops', + # Fetches of sess.run() + 'fetches', + # Op that performs synchronization in distributed mode + 'execution_barrier', + # The global step variable + 'global_step', + # Group of ops that perform per-device initialization work + 'local_var_init_op_group', + # Op to produce summaries + 'summary_op' + ]) + + +# InputProcessingInfo contains various sources of inputs which will be later fed +# into the model. If synthetic data is used, all three fields are None. +InputProcessingInfo = namedtuple( + 'InputProcessingInfo', + [ + # The first two fields are non-None iff datasets prefetching is not + # used. + + # Ops that produce the input batches. + 'input_producer_op', + # A list of StagingArea for each device. + 'input_producer_stages', + + # Input produced using multi device iterator. Non-None iff datasets + # prefetching is used + 'multi_device_iterator_input' + ]) + + +# A string specifying the npy file postfix for saving predicted logits. +flags.DEFINE_string('save_dir', '.', 'The dir to which the predicted logits npy file will be saved.') + +# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and +# float flags, and change certain string flags to enum flags. + +flags.DEFINE_string('model', 'trivial', + 'Name of the model to run, the list of supported models ' + 'are defined in models/model.py') +# The code will first check if it's running under benchmarking mode +# or evaluation mode, depending on 'eval': +# Under the evaluation mode, this script will read a saved model, +# and compute the accuracy of the model against a validation dataset. +# Additional ops for accuracy and top_k predictors are only used under +# this mode. +# Under the benchmarking mode, user can specify whether nor not to use +# the forward-only option, which will only compute the loss function. +# forward-only cannot be enabled with eval at the same time. +flags.DEFINE_boolean('eval', False, 'whether use eval or benchmarking') +flags.DEFINE_integer('eval_interval_secs', 0, + 'How often to run eval on saved checkpoints. Usually the ' + 'same as save_model_secs from the corresponding training ' + 'run. Pass 0 to eval only once.') +flags.DEFINE_integer('eval_during_training_every_n_steps', None, + 'Every n steps during training, pause training, run ' + 'evaluation, then resume training. Must not be used with ' + '--eval, as unlike --eval, this option causes both ' + 'training and eval to be done. This may take slightly ' + 'more GPU memory than running just training or evaluation ' + 'alone. It also may slightly slow down training, even ' + 'when not taking into account the additional time to ' + 'evaluate.', lower_bound=1) +flags.DEFINE_float('eval_during_training_every_n_epochs', None, + 'After every n training epochs, pause training, run ' + 'evaluation, then resume training. See ' + '--eval_during_training_every_n_steps for more information.') +flags.DEFINE_list('eval_during_training_at_specified_steps', [], + 'Specify a list of training steps, pause training at each of ' + 'these steps, run evaluation, then resume training. See ' + '--eval_during_training_every_n_steps for more information.') +flags.DEFINE_list('eval_during_training_at_specified_epochs', [], + 'Specify a list of training epochs, pause training after ' + 'each of these epochs, run evaluation, then resume training. 
' + 'See --eval_during_training_every_n_steps for more ' + 'information.') +flags.DEFINE_boolean('forward_only', False, + 'whether use forward-only or training for benchmarking') +flags.DEFINE_boolean('freeze_when_forward_only', False, + 'whether to freeze the graph when in forward-only mode.') +flags.DEFINE_boolean('print_training_accuracy', False, + 'whether to calculate and print training accuracy during ' + 'training') +flags.DEFINE_integer('batch_size', 0, 'batch size per compute device') +flags.DEFINE_integer('eval_batch_size', 0, 'eval batch size per compute device') +flags.DEFINE_integer('batch_group_size', 1, + 'number of groups of batches processed in the image ' + 'producer.') +flags.DEFINE_integer('num_batches', None, 'number of batches to run, excluding ' + 'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES) +flags.DEFINE_integer('num_eval_batches', None, + 'number of eval batches to run, excluding warmup. ' + 'Defaults to --num_batches') +flags.DEFINE_float('num_epochs', 90, + 'number of epochs to run, excluding warmup. ' + 'This and --num_batches cannot both be specified.') +flags.DEFINE_float('num_eval_epochs', None, + 'number of eval epochs to run, excluding warmup. ' + 'Defaults to --num_epochs') +flags.DEFINE_float('stop_at_top_1_accuracy', None, + 'If set, stops training after the evaluation accuracy hits ' + 'this number. Can only be used with one of the ' + '--eval_during_training_* flags.') +flags.DEFINE_boolean('collect_eval_results_async', False, + 'If True, start a separate process to postprocess eval ' + 'results asynchronously. This currently only works with ' + 'the SSD model.') +flags.DEFINE_integer('num_warmup_batches', None, + 'number of batches to run before timing') +flags.DEFINE_integer('autotune_threshold', None, + 'The autotune threshold for the models') +# TODO(tucker): change num_gpus to num_devices +flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on') +flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order') +flags.DEFINE_integer('display_every', 10, + 'Number of local steps after which progress is printed ' + 'out') +flags.DEFINE_float('display_perf_ewma', None, + 'If set, display numbers of images/sec using exponentially ' + 'weighted moving avearge with the specified weight, which ' + 'defines how much current value contributes to the reported ' + 'average. Increasing weight makes the reported performance ' + 'number reflect more about the real-time speed instead of ' + 'the entire history', lower_bound=0, upper_bound=1) +flags.DEFINE_string('data_dir', None, + 'Path to dataset in TFRecord format (aka Example ' + 'protobufs). If not specified, synthetic data will be ' + 'used.') +flags.DEFINE_string('data_name', None, + 'Name of dataset: imagenet or cifar10. If not specified, ' + 'it is automatically guessed based on data_dir.') +flags.DEFINE_string('resize_method', 'bilinear', + 'Method for resizing input images: crop, nearest, ' + 'bilinear, bicubic, area, or round_robin. The `crop` mode ' + 'requires source images to be at least as large as the ' + 'network input size. The `round_robin` mode applies ' + 'different resize methods based on position in a batch in ' + 'a round-robin fashion. Other modes support any sizes and ' + 'apply random bbox distortions before resizing (even with ' + 'distortions=False).') +flags.DEFINE_boolean('distortions', False, + 'Enable/disable distortions during image preprocessing. 
' + 'These include bbox and color distortions.') +flags.DEFINE_boolean('use_datasets', True, + 'Enable use of datasets for input pipeline') +flags.DEFINE_string('input_preprocessor', 'default', + 'Name of input preprocessor. The list of supported input ' + 'preprocessors are defined in preprocessing.py.') +flags.DEFINE_string('gpu_thread_mode', 'gpu_private', + 'Methods to assign GPU host work to threads. ' + 'global: all GPUs and CPUs share the same global threads; ' + 'gpu_private: a private threadpool for each GPU; ' + 'gpu_shared: all GPUs share the same threadpool.') +flags.DEFINE_integer('per_gpu_thread_count', 0, + 'The number of threads to use for GPU. Only valid when ' + 'gpu_thread_mode is not global.') +flags.DEFINE_boolean('hierarchical_copy', False, + 'Use hierarchical copies. Currently only optimized for ' + 'use on a DGX-1 with 8 GPUs and may perform poorly on ' + 'other hardware. Requires --num_gpus > 1, and only ' + 'recommended when --num_gpus=8') +# TODO(hinsu): Support auto-detection of the network topology while still +# retaining the ability to specify a particular topology for debugging. +flags.DEFINE_enum( + 'network_topology', constants.NetworkTopology.DGX1, + (constants.NetworkTopology.DGX1, constants.NetworkTopology.GCP_V100), + 'Network topology specifies the topology used to connect multiple devices. ' + 'Network topology is used to decide the hierarchy to use for the ' + 'hierarchical_copy.') +flags.DEFINE_integer('gradient_repacking', 0, 'Use gradient repacking. It' + 'currently only works with replicated mode. At the end of' + 'of each step, it repacks the gradients for more efficient' + 'cross-device transportation. A non-zero value specifies' + 'the number of split packs that will be formed.', + lower_bound=0) +flags.DEFINE_boolean('compact_gradient_transfer', True, 'Compact gradient' + 'as much as possible for cross-device transfer and ' + 'aggregation.') +flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'), + 'The data consistency for trainable variables. With strong ' + 'consistency, the variable always have the updates from ' + 'previous step. With relaxed consistency, all the updates ' + 'will eventually show up in the variables. Likely one step ' + 'behind.') +flags.DEFINE_boolean('datasets_repeat_cached_sample', False, + 'Enable use of a special datasets pipeline that reads a ' + 'single TFRecord into memory and repeats it infinitely ' + 'many times. The purpose of this flag is to make it ' + 'possible to write regression tests that are not ' + 'bottlenecked by CNS throughput. ' + 'Use datasets_use_caching to cache input data.') +flags.DEFINE_enum('local_parameter_device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'), + 'Device to use as parameter server: cpu or gpu. For ' + 'distributed training, it can affect where caching of ' + 'variables happens.') +flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'), + 'Device to use for computation: cpu or gpu') +flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'), + 'Data layout to use: NHWC (TF native) or NCHW (cuDNN ' + 'native, requires GPU).') +flags.DEFINE_integer('num_intra_threads', None, + 'Number of threads to use for intra-op parallelism. If ' + 'set to 0, the system will pick an appropriate number. ' + 'None is the same as 0 except that it disables intra-op ' + 'parallelism on a GPU.') +flags.DEFINE_integer('num_inter_threads', 0, + 'Number of threads to use for inter-op parallelism. 
If ' + 'set to 0, the system will pick an appropriate number.') +flags.DEFINE_boolean('use_numa_affinity', False, + 'Whether to turn on NUMA affinity for CPU devices. ' + 'This is probably only useful when --device=cpu.') +flags.DEFINE_string('trace_file', '', + 'Enable TensorFlow tracing and write trace to this file.') +flags.DEFINE_boolean('use_chrome_trace_format', True, + 'If True, the trace_file, if specified, will be in a ' + 'Chrome trace format. If False, then it will be a ' + 'StepStats raw proto.') +flags.DEFINE_boolean('use_deep_stem', False, + 'If True, use deep stem style (replace 7*7 conv to 3 3*3 conv) ' + 'Resnet model only') +_NUM_STEPS_TO_PROFILE = 10 +_NUM_OPS_TO_PRINT = 20 +flags.DEFINE_string('tfprof_file', None, + 'If specified, write a tfprof ProfileProto to this file. ' + 'The performance and other aspects of the model can then ' + 'be analyzed with tfprof. See ' + 'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md ' # pylint: disable=line-too-long + 'for more info on how to do this. The first %d steps ' + 'are profiled. Additionally, the top %d most time ' + 'consuming ops will be printed.\n' + 'Note: profiling with tfprof is very slow, but most of the ' + 'overhead is spent between steps. So, profiling results ' + 'are more accurate than the slowdown would suggest.' % + (_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT)) +flags.DEFINE_string('graph_file', None, + 'Write the model\'s graph definition to this file. ' + 'Defaults to binary format unless filename ends in "txt".') +flags.DEFINE_string('partitioned_graph_file_prefix', None, + 'If specified, after the graph has been partitioned and ' + 'optimized, write out each partitioned graph to a file ' + 'with the given prefix.') +flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'), + 'Optimizer to use') +flags.DEFINE_float('init_learning_rate', None, + 'Initial learning rate for training.') +flags.DEFINE_string('piecewise_learning_rate_schedule', None, + 'Specifies a piecewise learning rate schedule based on the ' + 'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, ' + 'where each LRi is a learning rate and each Ei is an epoch ' + 'indexed from 0. The learning rate is LRi if the ' + 'E(i-1) <= current_epoch < Ei. For example, if this ' + 'paramater is 0.3;10;0.2;25;0.1, the learning rate is 0.3 ' + 'for the first 10 epochs, then is 0.2 for the next 15 ' + 'epochs, then is 0.1 until training ends.') +flags.DEFINE_float('num_epochs_per_decay', 0, + 'Steps after which learning rate decays. If 0, the learning ' + 'rate does not decay.') +flags.DEFINE_float('learning_rate_decay_factor', 0, + 'Learning rate decay factor. Decay by this factor every ' + '`num_epochs_per_decay` epochs. If 0, learning rate does ' + 'not decay.') +flags.DEFINE_float('num_learning_rate_warmup_epochs', 0, + 'Slowly increase to the initial learning rate in the first ' + 'num_learning_rate_warmup_epochs linearly.') +flags.DEFINE_float('minimum_learning_rate', 0, + 'The minimum learning rate. The learning rate will ' + 'never decay past this value. Requires `learning_rate`, ' + '`num_epochs_per_decay` and `learning_rate_decay_factor` to ' + 'be set.') +flags.DEFINE_float('resnet_base_lr', None, "Base learning rate at bs=256. 
Only " + "relevant when training ResNet and utilizing the model's " + "learning rate heuristic (get_learning_rate).") +flags.DEFINE_float('momentum', 0.9, 'Momentum for training.') +flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.') +flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.') +flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.') +flags.DEFINE_float('adam_beta1', 0.9, 'Beta2 term for the Adam optimizer') +flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer') +flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer') +flags.DEFINE_float('gradient_clip', None, + 'Gradient clipping magnitude. Disabled by default.') +flags.DEFINE_float('weight_decay', 0.00004, + 'Weight decay factor for training.') +flags.DEFINE_float('gpu_memory_frac_for_testing', 0, + 'If non-zero, the fraction of GPU memory that will be used. ' + 'Useful for testing the benchmark script, as this allows ' + 'distributed mode to be run on a single machine. For ' + 'example, if there are two tasks, each can be allocated ' + '~40 percent of the memory on a single machine. This is ' + 'also useful for using unified memory, as this can be set ' + 'above 1 to oversubscribe the GPU using unified memory.', + lower_bound=0.) +flags.DEFINE_boolean('use_unified_memory', None, + 'If True, allocate unified memory enabling larger models ' + 'to fit in available device RAM.') +flags.DEFINE_boolean('timestamped_allocator', False, + 'If True marks free BFCAllocator::Chunks with time ' + 'at which they are freed which can allow more efficient ' + 'memory allocation in cases like RDMA networking.') +flags.DEFINE_integer('gpu_kt_max_interval', 0, + 'If > 0, the maximum number of GPU Ops that may be queued ' + 'in a row without also queuing a tracking event.') +flags.DEFINE_integer('gpu_kt_max_bytes', 0, + 'If > 0, the maximum number of bytes ' + 'of GPU memory that may be allocated by sequential ' + 'GPU Ops without queuing a tracking event.') +flags.DEFINE_integer('gpu_kt_max_pending', 0, + 'If > 0 no more than this many GPU tracking events may be ' + 'outstanding at any time. When this limit is reached ' + 'launch of additional kernels will stall until an ' + 'outstanding event completes.') +flags.DEFINE_boolean('use_tf_layers', True, + 'If True, use tf.layers for neural network layers. This ' + 'should not affect performance or accuracy in any way.') +flags.DEFINE_integer('tf_random_seed', 1234, + 'The TensorFlow random seed. Useful for debugging NaNs, ' + 'as this can be set to various values to see if the NaNs ' + 'depend on the seed.') +flags.DEFINE_string('debugger', None, + 'If set, use the TensorFlow debugger. If set to "cli", use ' + 'the local CLI debugger. Otherwise, this must be in the ' + 'form hostname:port (e.g., localhost:7007) in which case ' + 'the experimental TensorBoard debugger will be used') +flags.DEFINE_boolean('use_python32_barrier', False, + 'When on, use threading.Barrier at Python 3.2.') + +flags.DEFINE_boolean('ml_perf', False, + 'When True, change how the Imagenet input pipeline works ' + 'slightly to meet the MLPerf compliance rules. This slows ' + 'down the input pipeline. Without this option, at the end ' + 'of the input pipeline, the image is divided by 127.5, ' + 'then 1.0 is subtracted from it, bringing the image ' + 'values from [0, 255] to [-1.0, 1.0]. 
With this option, ' + 'each of the three channels (red, green, blue) have the ' + 'average channel value among all image subtracted from ' + 'it, and no division is done.') + +flags.DEFINE_boolean('datasets_use_prefetch', True, + 'Enable use of prefetched datasets for input pipeline. ' + 'This option is meaningless if use_datasets=False.') +flags.DEFINE_integer('datasets_prefetch_buffer_size', 1, + 'Prefetching op buffer size per compute device.') +flags.DEFINE_integer('datasets_num_private_threads', None, + 'Number of threads for a private threadpool created for ' + 'all datasets computation. By default, we pick an ' + 'appropriate number. If set to 0, we use the default ' + 'tf-Compute threads for dataset operations.') +flags.DEFINE_boolean('datasets_use_caching', False, + 'Cache the compressed input data in memory. This improves ' + 'the data input performance, at the cost of additional ' + 'memory.') +flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None, + 'Number of parallel file readers interleaving input data.') +flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False, + 'Allow parallel interleave to depart from deterministic ' + 'ordering, by temporarily skipping over files whose ' + 'elements are not readily available. This can increase ' + 'througput in particular in the presence of stragglers.') +flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None, + 'The number of input elements to fetch before they are ' + 'needed for interleaving.') + +flags.DEFINE_integer( + 'multi_device_iterator_max_buffer_size', 1, + 'Configuration parameter for the MultiDeviceIterator that ' + ' specifies the host side buffer size for each device.') + +# Performance tuning parameters. +flags.DEFINE_boolean('winograd_nonfused', True, + 'Enable/disable using the Winograd non-fused algorithms.') +flags.DEFINE_boolean( + 'batchnorm_persistent', True, + 'Enable/disable using the CUDNN_BATCHNORM_SPATIAL_PERSISTENT ' + 'mode for batchnorm.') +flags.DEFINE_boolean('sync_on_finish', False, + 'Enable/disable whether the devices are synced after each ' + 'step.') +flags.DEFINE_boolean('staged_vars', False, + 'whether the variables are staged from the main ' + 'computation') +flags.DEFINE_boolean('force_gpu_compatible', False, + 'whether to enable force_gpu_compatible in GPU_Options') +flags.DEFINE_boolean('allow_growth', None, + 'whether to enable allow_growth in GPU_Options') +flags.DEFINE_boolean('xla', False, 'whether to enable XLA auto-jit compilation') +flags.DEFINE_boolean('xla_compile', False, + 'Enable xla to compile the graph. Uncompilable ops will ' + 'result in fatal errors.') +flags.DEFINE_boolean('fuse_decode_and_crop', True, + 'Fuse decode_and_crop for image preprocessing.') +flags.DEFINE_boolean('distort_color_in_yiq', True, + 'Distort color of input images in YIQ space.') +flags.DEFINE_boolean('enable_optimizations', True, + 'Whether to enable grappler and other optimizations.') +flags.DEFINE_string('rewriter_config', None, + 'Config for graph optimizers, described as a ' + 'RewriterConfig proto buffer.') +flags.DEFINE_enum('loss_type_to_report', 'total_loss', + ('base_loss', 'total_loss'), + 'Which type of loss to output and to write summaries for. ' + 'The total loss includes L2 loss while the base loss does ' + 'not. 
Note that the total loss is always used while ' + 'computing gradients during training if weight_decay > 0, ' + 'but explicitly computing the total loss, instead of just ' + 'computing its gradients, can have a performance impact.') +flags.DEFINE_boolean('single_l2_loss_op', False, + 'If True, instead of using an L2 loss op per variable, ' + 'concatenate the variables into a single tensor and do a ' + 'single L2 loss on the concatenated tensor.') +flags.DEFINE_boolean('use_resource_vars', False, + 'Use resource variables instead of normal variables. ' + 'Resource variables are slower, but this option is useful ' + 'for debugging their performance.') +flags.DEFINE_boolean('compute_lr_on_cpu', False, + 'If True, do computations related to learning rate on the ' + 'CPU instead of the GPU. This will significantly improve ' + 'XLA performance in some cases.') +flags.DEFINE_boolean('sparse_to_dense_grads', False, + 'If True, convert all sparse gradients to dense gradients ' + 'before passing them to the optimizer to update ' + 'variables. Only affects models with sparse gradients, ' + 'which currently is only the NCF model.') +# Performance tuning specific to MKL. +flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.') +flags.DEFINE_integer('kmp_blocktime', 0, + 'The time, in milliseconds, that a thread should wait, ' + 'after completing the execution of a parallel region, ' + 'before sleeping') +flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0', + 'Restricts execution of certain threads (virtual execution ' + 'units) to a subset of the physical processing units in a ' + 'multiprocessor computer.') +flags.DEFINE_integer('kmp_settings', 1, + 'If set to 1, MKL settings will be printed.') + +# fp16 parameters. If use_fp16=False, no other fp16 parameters apply. +flags.DEFINE_boolean('use_fp16', False, + 'Use 16-bit floats for certain tensors instead of 32-bit ' + 'floats. This is currently experimental.') +# TODO(reedwm): The default loss scale of 128 causes most models to diverge +# on the second step with synthetic data. Changing the tf.set_random_seed +# call to tf.set_random_seed(1235) or most other seed values causes the +# issue not to occur. +flags.DEFINE_float('fp16_loss_scale', None, + 'If fp16 is enabled, the loss is multiplied by this amount ' + 'right before gradients are computed, then each gradient ' + 'is divided by this amount. Mathematically, this has no ' + 'effect, but it helps avoid fp16 underflow. Set to 1 to ' + 'effectively disable. Ignored during eval.') +flags.DEFINE_boolean('fp16_vars', False, + 'If fp16 is enabled, also use fp16 for variables. If ' + 'False, the variables are stored in fp32 and casted to ' + 'fp16 when retrieved. Recommended to leave as False.') +flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False, + 'If True and use_fp16 is True, automatically adjust the ' + 'loss scale during training.') +flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000, + 'If fp16 is enabled and fp16_enable_auto_loss_scale is ' + 'True, increase the loss scale every n steps.') + +# The method for managing variables: +# parameter_server: variables are stored on a parameter server that holds +# the master copy of the variable. In local execution, a local device +# acts as the parameter server for each variable; in distributed +# execution, the parameter servers are separate processes in the +# cluster. 
+# For each step, each tower gets a copy of the variables from the +# parameter server, and sends its gradients to the param server. +# replicated: each GPU has its own copy of the variables. To apply +# gradients, an all_reduce algorithm or or regular cross-device +# aggregation is used to replicate the combined gradients to all +# towers (depending on all_reduce_spec parameter setting). +# independent: each GPU has its own copy of the variables, and gradients +# are not shared between towers. This can be used to check performance +# when no data is moved between GPUs. +# distributed_replicated: Distributed training only. Each GPU has a copy +# of the variables, and updates its copy after the parameter servers +# are all updated with the gradients from all servers. Only works with +# cross_replica_sync=true. Unlike 'replicated', currently never uses +# nccl all-reduce for replicating within a server. +# distributed_all_reduce: Distributed training where all replicas run +# in a single session, using all-reduce to mutally reduce the +# gradients. Uses no parameter servers. When there is only one +# worker, this is the same as replicated. +# collective_all_reduce: Distributed training where all replicas run +# independepently except for variable initialization and for +# gradient reduction which is done via collective all-reduce. +# NOTE: collective_all_reduce in conjunction with use_fp16 can +# lead to NaNs in some models (resnet50). TODO(tucker): fix it. +# horovod: Distributed training using Horovod library. Runs workers using +# an MPI framework (e.g. Open MPI). Each worker runs training on +# single GPU, and averages gradients using NCCL or MPI all-reduce. +# See https://github.com/uber/horovod for more details. +flags.DEFINE_enum('variable_update', 'parameter_server', + ('parameter_server', 'replicated', 'distributed_replicated', + 'independent', 'distributed_all_reduce', + 'collective_all_reduce', 'horovod'), + 'The method for managing variables: parameter_server, ' + 'replicated, distributed_replicated, independent, ' + 'distributed_all_reduce, collective_all_reduce, horovod') +flags.DEFINE_string('all_reduce_spec', None, + 'A specification of the all_reduce algorithm to be used ' + 'for reducing gradients. For more details, see ' + 'parse_all_reduce_spec in variable_mgr.py. An ' + 'all_reduce_spec has BNF form:\n' + 'int ::= positive whole number\n' + 'g_int ::= int[KkMGT]?\n' + 'alg_spec ::= alg | alg#int\n' + 'range_spec ::= alg_spec | alg_spec/alg_spec\n' + 'spec ::= range_spec | range_spec:g_int:range_spec\n' + 'NOTE: not all syntactically correct constructs are ' + 'supported.\n\n' + 'Examples:\n ' + '"xring" == use one global ring reduction for all ' + 'tensors\n' + '"pscpu" == use CPU at worker 0 to reduce all tensors\n' + '"nccl" == use NCCL to locally reduce all tensors. ' + 'Limited to 1 worker.\n' + '"nccl/xring" == locally (to one worker) reduce values ' + 'using NCCL then ring reduce across workers.\n' + '"pscpu:32k:xring" == use pscpu algorithm for tensors of ' + 'size up to 32kB, then xring for larger tensors.') + +# If variable_update==distributed_all_reduce then it may be advantageous +# to aggregate small tensors into one prior to reduction. These parameters +# control that aggregation. 
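+# For example, passing --agg_small_grads_max_bytes=1024 and
+# --agg_small_grads_max_group=16 would pack up to 16 tensors of under 1KB each
+# into a single tensor before the all-reduce (values are illustrative only,
+# not recommendations).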
+flags.DEFINE_integer('agg_small_grads_max_bytes', 0, + 'If > 0, try to aggregate tensors of less than this ' + 'number of bytes prior to all-reduce.') +flags.DEFINE_integer('agg_small_grads_max_group', 10, + 'When aggregating small tensors for all-reduce do not ' + 'aggregate more than this many into one new tensor.') +flags.DEFINE_integer('allreduce_merge_scope', 1, + 'Establish a name scope around this many ' + 'gradients prior to creating the all-reduce operations. ' + 'It may affect the ability of the backend to merge ' + 'parallel ops.') + +# Distributed training parameters. +flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''), + 'One of "ps", "worker", "controller", "". Empty for local ' + 'training') +flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts') +flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts') +flags.DEFINE_string('controller_host', None, 'optional controller host') +flags.DEFINE_integer('task_index', 0, 'Index of task within the job') +flags.DEFINE_string('server_protocol', 'grpc', 'protocol for servers') +flags.DEFINE_boolean('cross_replica_sync', True, '') +flags.DEFINE_string('horovod_device', '', 'Device to do Horovod all-reduce on: ' + 'empty (default), cpu or gpu. Default with utilize GPU if ' + 'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE ' + 'option, and CPU otherwise.') + +# Summary and Save & load checkpoints. +flags.DEFINE_integer('summary_verbosity', 0, 'Verbosity level for summary ops. ' + 'level 0: disable any summary.\n' + 'level 1: small and fast ops, e.g.: learning_rate, ' + 'total_loss.\n' + 'level 2: medium-cost ops, e.g. histogram of all ' + 'gradients.\n' + 'level 3: expensive ops: images and histogram of each ' + 'gradient.\n') +flags.DEFINE_integer('save_summaries_steps', 0, + 'How often to save summaries for trained models. Pass 0 ' + 'to disable summaries.') +flags.DEFINE_integer('save_model_secs', 0, + 'How often to save trained models. Pass 0 to disable ' + 'saving checkpoints every N seconds. A checkpoint is ' + 'saved after training completes regardless of this ' + 'option.') +flags.DEFINE_integer('save_model_steps', None, + 'How often to save trained models. If specified, ' + 'save_model_secs must not be specified.') +flags.DEFINE_integer('max_ckpts_to_keep', 5, + 'Max number of checkpoints to keep.') +flags.DEFINE_string('train_dir', None, + 'Path to session checkpoints. Pass None to disable saving ' + 'checkpoint at the end.') +flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval', + 'Directory where to write eval event logs.') +flags.DEFINE_string('backbone_model_path', None, + 'Path to pretrained backbone model checkpoint. Pass None ' + 'if not using a backbone model.') +flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'], + 'If this is specified in forward_only mode and ' + 'freeze_when_forward_only is set to True, use TensorRT to ' + 'optimize the graph before execution.') +flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30, + 'Max workspace size bytes used by the TensorRT optimizer.') + +# Benchmark logging for model garden metric +flags.DEFINE_string('benchmark_log_dir', None, + 'The directory to place the log files containing the ' + 'results of benchmark. The logs are created by ' + 'BenchmarkFileLogger. Requires the root of the Tensorflow ' + 'models repository to be in $PYTHTONPATH.') +flags.DEFINE_string('benchmark_test_id', None, + 'The unique test ID of the benchmark run. 
It could be the ' + 'combination of key parameters. It is hardware independent ' + 'and could be used compare the performance between ' + 'different test runs. This flag is designed for human ' + 'consumption, and does not have any impact within the ' + 'system.') + +platforms_util.define_platform_params() + + +class GlobalStepWatcher(threading.Thread): + """A helper class for global_step. + + Polls for changes in the global_step of the model, and finishes when the + number of steps for the global run are done. + """ + + def __init__(self, sess, global_step_op, start_at_global_step, + end_at_global_step): + threading.Thread.__init__(self) + self.sess = sess + self.global_step_op = global_step_op + self.start_at_global_step = start_at_global_step + self.end_at_global_step = end_at_global_step + + self.start_time = 0 + self.start_step = 0 + self.finish_time = 0 + self.finish_step = 0 + + def run(self): + while self.finish_time == 0: + time.sleep(.25) + global_step_val, = self.sess.run([self.global_step_op]) + if self.start_time == 0 and global_step_val >= self.start_at_global_step: + # Use tf.logging.info instead of log_fn, since print (which is log_fn) + # is not thread safe and may interleave the outputs from two parallel + # calls to print, which can break tests. + tf.logging.info('Starting real work at step %s at time %s' % + (global_step_val, time.ctime())) + self.start_time = time.time() + self.start_step = global_step_val + if self.finish_time == 0 and global_step_val >= self.end_at_global_step: + tf.logging.info('Finishing real work at step %s at time %s' % + (global_step_val, time.ctime())) + self.finish_time = time.time() + self.finish_step = global_step_val + + def done(self): + return self.finish_time > 0 + + def num_steps(self): + return self.finish_step - self.start_step + + def elapsed_time(self): + return self.finish_time - self.start_time + + +class CheckpointNotFoundException(Exception): + pass + + +def create_config_proto(params): + """Returns session config proto. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. 
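+
+  Returns:
+    A tf.ConfigProto for the tf.Session, configured according to `params`.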
+ """ + config = tf.ConfigProto() + config.allow_soft_placement = True + if params.num_intra_threads is None: + if params.device == 'gpu': + config.intra_op_parallelism_threads = 1 + else: + config.intra_op_parallelism_threads = params.num_intra_threads + config.inter_op_parallelism_threads = params.num_inter_threads + config.experimental.collective_group_leader = '/job:worker/replica:0/task:0' + config.gpu_options.experimental.collective_ring_order = params.gpu_indices + config.gpu_options.force_gpu_compatible = params.force_gpu_compatible + config.experimental.use_numa_affinity = params.use_numa_affinity + if params.device == 'cpu': + # TODO(tucker): change num_gpus to num_devices + config.device_count['CPU'] = params.num_gpus + if params.allow_growth is not None: + config.gpu_options.allow_growth = params.allow_growth + if params.gpu_memory_frac_for_testing > 0: + config.gpu_options.per_process_gpu_memory_fraction = ( + params.gpu_memory_frac_for_testing) + if params.use_unified_memory: + config.gpu_options.experimental.use_unified_memory = ( + params.use_unified_memory) + if params.timestamped_allocator: + config.gpu_options.experimental.timestamped_allocator = ( + params.timestamped_allocator) + if params.gpu_kt_max_interval > 0: + config.gpu_options.experimental.kernel_tracker_max_interval = ( + params.gpu_kt_max_interval) + if params.gpu_kt_max_bytes > 0: + config.gpu_options.experimental.kernel_tracker_max_bytes = ( + params.gpu_kt_max_bytes) + if params.gpu_kt_max_pending > 0: + config.gpu_options.experimental.kernel_tracker_max_pending = ( + params.gpu_kt_max_pending) + if params.xla: + config.graph_options.optimizer_options.global_jit_level = ( + tf.OptimizerOptions.ON_1) + if params.rewriter_config: + rewriter_config = rewriter_config_pb2.RewriterConfig() + text_format.Merge(params.rewriter_config, rewriter_config) + config.graph_options.rewrite_options.CopyFrom(rewriter_config) + elif not params.enable_optimizations: + config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0 + config.graph_options.rewrite_options.disable_meta_optimizer = True + elif params.variable_update == 'collective_all_reduce': + rewrite_options = config.graph_options.rewrite_options + rewrite_options.scoped_allocator_optimization = ( + rewriter_config_pb2.RewriterConfig.ON) + rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce') + if params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + config.gpu_options.visible_device_list = str(hvd.local_rank()) + # For collective_all_reduce, ignore all devices except current worker. + if params.variable_update == 'collective_all_reduce': + del config.device_filters[:] + config.device_filters.append( + '/job:%s/replica:0/task:%d' % (params.job_name, params.task_index)) + + # TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed. + # Currently we have to disable PinToHostOptimizer w/ XLA since it causes + # OOM/perf cliffs. + config.graph_options.rewrite_options.pin_to_host_optimization = ( + rewriter_config_pb2.RewriterConfig.OFF) + return config + + +def get_mode_from_params(params): + """Returns the mode in which this script is running. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + Raises: + ValueError: Unsupported params settings. 
+ """ + if params.forward_only and params.eval: + raise ValueError('Only one of forward_only and eval parameters is true') + + if params.eval: + return constants.BenchmarkMode.EVAL + elif params.forward_only: + return constants.BenchmarkMode.FORWARD_ONLY + elif (params.eval_during_training_every_n_steps or + params.eval_during_training_every_n_epochs or + params.eval_during_training_at_specified_steps or + params.eval_during_training_at_specified_epochs): + return constants.BenchmarkMode.TRAIN_AND_EVAL + else: + return constants.BenchmarkMode.TRAIN + + +# How many digits to show for the loss and accuracies during training. +LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3 + + +def benchmark_one_step(sess, + fetches, + step, + batch_size, + step_train_times, + trace_filename, + partitioned_graph_file_prefix, + profiler, + image_producer, + params, + summary_op=None, + show_images_per_sec=True, + benchmark_logger=None, + collective_graph_key=0): + """Advance one step of benchmarking.""" + should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE + need_options_and_metadata = ( + should_profile or collective_graph_key > 0 or + ((trace_filename or partitioned_graph_file_prefix) and step == -2) + ) + if need_options_and_metadata: + run_options = tf.RunOptions() + if (trace_filename and step == -2) or should_profile: + run_options.trace_level = tf.RunOptions.FULL_TRACE + if partitioned_graph_file_prefix and step == -2: + run_options.output_partition_graphs = True + if collective_graph_key > 0: + run_options.experimental.collective_graph_key = collective_graph_key + run_metadata = tf.RunMetadata() + else: + run_options = None + run_metadata = None + summary_str = None + start_time = time.time() + if summary_op is None: + results = sess.run(fetches, options=run_options, run_metadata=run_metadata) + else: + (results, summary_str) = sess.run( + [fetches, summary_op], options=run_options, run_metadata=run_metadata) + + if not params.forward_only: + lossval = results['average_loss'] + else: + lossval = 0. 
+ if image_producer is not None: + image_producer.notify_image_consumption() + train_time = time.time() - start_time + step_train_times.append(train_time) + if (show_images_per_sec and step >= 0 and + (step == 0 or (step + 1) % params.display_every == 0)): + speed_mean, speed_uncertainty, speed_jitter = get_perf_timing( + batch_size, step_train_times, params.display_perf_ewma) + log_str = '%i\t%s\t%.*f' % ( + step + 1, + get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter), + LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval) + if 'top_1_accuracy' in results: + log_str += '\t%.*f\t%.*f' % ( + LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'], + LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy']) + log_fn(log_str) + if benchmark_logger: + benchmark_logger.log_metric( + 'current_examples_per_sec', speed_mean, global_step=step + 1) + if 'top_1_accuracy' in results: + benchmark_logger.log_metric( + 'top_1_accuracy', results['top_1_accuracy'], global_step=step + 1) + benchmark_logger.log_metric( + 'top_5_accuracy', results['top_5_accuracy'], global_step=step + 1) + if need_options_and_metadata: + if should_profile: + profiler.add_step(step, run_metadata) + if trace_filename and step == -2: + log_fn('Dumping trace to %s' % trace_filename) + trace_dir = os.path.dirname(trace_filename) + if not gfile.Exists(trace_dir): + gfile.MakeDirs(trace_dir) + with gfile.Open(trace_filename, 'w') as trace_file: + if params.use_chrome_trace_format: + trace = timeline.Timeline(step_stats=run_metadata.step_stats) + trace_file.write(trace.generate_chrome_trace_format(show_memory=True)) + else: + trace_file.write(str(run_metadata.step_stats)) + if partitioned_graph_file_prefix and step == -2: + path, filename = os.path.split(partitioned_graph_file_prefix) + if '.' in filename: + base_filename, ext = filename.rsplit('.', 1) + ext = '.' + ext + else: + base_filename, ext = filename, '' + as_text = filename.endswith('txt') + for graph_def in run_metadata.partition_graphs: + device = graph_def.node[0].device.replace('/', '_').replace(':', '_') + graph_filename = '%s%s%s' % (base_filename, device, ext) + log_fn('Writing partitioned GraphDef as %s to %s' % ( + 'text' if as_text else 'binary', + os.path.join(path, graph_filename))) + tf.train.write_graph(graph_def, path, graph_filename, as_text) + return (summary_str, lossval) + + +def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1): + if scale == 1: + # TODO(laigd): rename 'images' to maybe 'inputs', same below. + return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' % + (speed_mean, speed_uncertainty, speed_jitter)) + else: + return 'images/sec: %.1f' % speed_mean + + +def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1): + """Calculate benchmark processing speed.""" + times = np.array(step_train_times) + speeds = batch_size / times + if ewma_alpha: + weights = np.logspace(len(times)-1, 0, len(times), base=1-ewma_alpha) + time_mean = np.average(times, weights=weights) + else: + time_mean = np.mean(times) + speed_mean = scale * batch_size / time_mean + speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds))) + speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds))) + return speed_mean, speed_uncertainty, speed_jitter + + +def load_checkpoint(saver, sess, ckpt_dir): + """Loads checkpoint from provided directory or full path. + + Args: + saver: Saver used to restore the checkpoint. + sess: TensorFlow session. + ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint. 
+ + Returns: + Global step. + """ + model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir) + global_step = model_checkpoint_path.split('/')[-1].split('-')[-1] + if not global_step.isdigit(): + global_step = 0 + else: + global_step = int(global_step) + saver.restore(sess, model_checkpoint_path) + log_fn('Successfully loaded model from %s.' % model_checkpoint_path) + return global_step + + +def _get_checkpoint_to_load(ckpt_dir): + """Returns which checkpoint to load. + + Args: + ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint. + + Returns: + Full path to checkpoint to load. + + Raises: + CheckpointNotFoundException: If checkpoint is not found. + """ + p = re.compile(r'ckpt-\d+$') + if p.search(ckpt_dir): + model_checkpoint_path = ckpt_dir + else: + # Finds latest checkpoint in directory provided + ckpt = tf.train.get_checkpoint_state(ckpt_dir) + if ckpt and ckpt.model_checkpoint_path: + model_checkpoint_path = ckpt.model_checkpoint_path + else: + raise CheckpointNotFoundException('No checkpoint file found in dir:{}'. + format(ckpt_dir)) + return model_checkpoint_path + + +# Params are passed to BenchmarkCNN's constructor. Params is a map from name +# to value, with one field per key in flags.param_specs. +# +# Call make_params() or make_params_from_flags() below to construct a Params +# tuple with default values from flags.param_specs, rather than constructing +# Params directly. +Params = namedtuple('Params', flags.param_specs.keys()) # pylint: disable=invalid-name + + +def validate_params(params): + """Validates that the Params tuple had valid values. + + When command-line flags are defined for each ParamSpec by calling + flags.define_flags(), calling this function is unnecessary because absl + already does flag validation. Otherwise, this function should be called. + + Args: + params: A Params tuple. + Raises: + ValueError: An element of params had an invalid value. + """ + for name, value in params._asdict().items(): + param_spec = flags.param_specs[name] + if param_spec.flag_type in ('integer', 'float'): + if (value is not None and param_spec.kwargs['lower_bound'] is not None and + value < param_spec.kwargs['lower_bound']): + raise ValueError('Param %s value of %s is lower than the lower bound ' + 'of %s' % + (name, value, param_spec.kwargs['lower_bound'])) + if (value is not None and param_spec.kwargs['upper_bound'] is not None and + param_spec.kwargs['upper_bound'] < value): + raise ValueError('Param %s value of %s is higher than the upper bound ' + 'of %s' % + (name, value, param_spec.kwargs['upper_bound'])) + elif (value is not None and param_spec.flag_type == 'enum' and + value not in param_spec.kwargs['enum_values']): + raise ValueError('Param %s of value %s is not in %s'% + (name, value, param_spec.kwargs['enum_values'])) + + +def make_params(**kwargs): + """Create a Params tuple for BenchmarkCNN from kwargs. + + Default values are filled in from flags.param_specs. + + Args: + **kwargs: kwarg values will override the default values. + Returns: + Params namedtuple for constructing BenchmarkCNN. + """ + # Create a (name: default_value) map from flags.param_specs. + default_kwargs = { + name: flags.param_specs[name].default_value + for name in flags.param_specs + } + params = Params(**default_kwargs)._replace(**kwargs) + validate_params(params) + return params + + +def make_params_from_flags(): + """Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS. + + Returns: + Params namedtuple for constructing BenchmarkCNN. 
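+
+  Example (sketch; assumes flags.define_flags() has been called and the absl
+  flags have already been parsed, e.g. inside absl.app.run()):
+
+    params = make_params_from_flags()
+    params = params._replace(model='resnet50', batch_size=64)
+    validate_params(params)
+    bench = BenchmarkCNN(params)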
+ """ + # Collect (name: value) pairs for absl_flags.FLAGS with matching names in + # flags.param_specs. + flag_values = {name: getattr(absl_flags.FLAGS, name) + for name in flags.param_specs.keys()} + return Params(**flag_values) + + +def remove_param_fields(params, fields_to_remove): + """Remove fields from a Params namedtuple.""" + params_dict = params._asdict() + for field in fields_to_remove: + assert field in params_dict, 'Invalid Params field: ' + field + params_dict = {k: v for k, v in params_dict.items() + if k not in fields_to_remove} + new_params_type = namedtuple('Params', params_dict.keys()) + return new_params_type(**params_dict) + + +def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch): + """Returns the number of batches and epochs to run for. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + batch_size: The number of images per step. + num_examples_per_epoch: The number of images in a single epoch. + + Returns: + num_batches: The number of batches to run for. + num_epochs: The number of epochs to run for. This might be slightly + smaller than params.num_epochs if specified, because the number of batches + must be an integer. + + Raises: + ValueError: Invalid or unsupported params. + """ + if params.num_batches and params.num_epochs: + raise ValueError('At most one of --num_batches and --num_epochs may be ' + 'specified.') + if params.num_epochs: + num_batches = int(params.num_epochs * num_examples_per_epoch + + batch_size - 1) // batch_size + else: + num_batches = params.num_batches or _DEFAULT_NUM_BATCHES + num_epochs = num_batches * batch_size / num_examples_per_epoch + return (num_batches, num_epochs) + + +def get_piecewise_learning_rate(piecewise_learning_rate_schedule, + global_step, num_batches_per_epoch): + """Returns a piecewise learning rate tensor. + + Args: + piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule + parameter + global_step: Scalar tensor representing the global step. + num_batches_per_epoch: float indicating the number of batches per epoch. + + Returns: + A scalar float tensor, representing the learning rate. + + Raises: + ValueError: piecewise_learning_rate_schedule is not formatted correctly. + """ + pieces = piecewise_learning_rate_schedule.split(';') + if len(pieces) % 2 == 0: + raise ValueError('--piecewise_learning_rate_schedule must have an odd ' + 'number of components') + values = [] + boundaries = [] + for i, piece in enumerate(pieces): + if i % 2 == 0: + try: + values.append(float(piece)) + except ValueError: + raise ValueError('Invalid learning rate: ' + piece) + else: + try: + boundaries.append(int(int(piece) * num_batches_per_epoch) - 1) + except ValueError: + raise ValueError('Invalid epoch: ' + piece) + return tf.train.piecewise_constant(global_step, boundaries, values, + name='piecewise_learning_rate') + + +def get_learning_rate(params, global_step, num_examples_per_epoch, model, + batch_size): + """Returns a learning rate tensor based on global_step. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + global_step: Scalar tensor representing the global step. + num_examples_per_epoch: The number of examples per epoch. + model: The model.Model object to obtain the default learning rate from if no + learning rate is specified. + batch_size: Number of examples per step + + Returns: + A scalar float tensor, representing the learning rate. 
When evaluated, the + learning rate depends on the current value of global_step. + + Raises: + ValueError: Invalid or unsupported params. + """ + with tf.name_scope('learning_rate'): + num_batches_per_epoch = num_examples_per_epoch / batch_size + + if params.piecewise_learning_rate_schedule: + if (params.init_learning_rate is not None or + params.learning_rate_decay_factor or + params.minimum_learning_rate or params.num_epochs_per_decay): + raise ValueError('No other learning rate-related flags can be ' + 'specified if --piecewise_learning_rate_schedule is ' + 'specified') + learning_rate = get_piecewise_learning_rate( + params.piecewise_learning_rate_schedule, + global_step, num_batches_per_epoch) + elif params.init_learning_rate is not None: + learning_rate = params.init_learning_rate + if (params.num_epochs_per_decay > 0 and + params.learning_rate_decay_factor > 0): + decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay) + + # Decay the learning rate exponentially based on the number of steps. + learning_rate = tf.train.exponential_decay( + params.init_learning_rate, + global_step, + decay_steps, + params.learning_rate_decay_factor, + staircase=True) + + if params.minimum_learning_rate != 0.: + learning_rate = tf.maximum(learning_rate, + params.minimum_learning_rate) + else: + learning_rate = model.get_learning_rate(global_step, batch_size) + if params.num_learning_rate_warmup_epochs > 0 and ( + params.init_learning_rate is not None or + params.piecewise_learning_rate_schedule): + warmup_steps = int(num_batches_per_epoch * + params.num_learning_rate_warmup_epochs) + init_lr = params.init_learning_rate + if init_lr is None: + init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0]) + warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast( + warmup_steps, tf.float32) + learning_rate = tf.cond(global_step < warmup_steps, + lambda: warmup_lr, lambda: learning_rate) + + learning_rate = mlperf.logger.log_deferred_tensor_value( + mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100) + return learning_rate + + +def get_optimizer(params, learning_rate): + """Returns the optimizer that should be used based on params.""" + if params.optimizer == 'momentum': + mlperf.logger.log(key=mlperf.tags.OPT_NAME, + value=mlperf.tags.SGD_WITH_MOMENTUM) + mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum) + opt = tf.train.MomentumOptimizer( + learning_rate, params.momentum, use_nesterov=True) + elif params.optimizer == 'sgd': + mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD) + opt = tf.train.GradientDescentOptimizer(learning_rate) + elif params.optimizer == 'rmsprop': + opt = tf.train.RMSPropOptimizer( + learning_rate, + params.rmsprop_decay, + momentum=params.rmsprop_momentum, + epsilon=params.rmsprop_epsilon) + elif params.optimizer == 'adam': + opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1, + params.adam_beta2, params.adam_epsilon) + else: + raise ValueError('Optimizer "{}" was not recognized'. + format(params.optimizer)) + return opt + + +def generate_tfprof_profile(profiler, tfprof_file): + """Generates a tfprof profile, writing it to a file and printing top ops. + + Args: + profiler: A tf.profiler.Profiler. `profiler.add_step` must have already been + called. + tfprof_file: The filename to write the ProfileProto to. 
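+
+  Example (sketch of the intended call pattern; the output path is
+  hypothetical):
+
+    profiler = tf.profiler.Profiler()
+    # ... run training steps, calling profiler.add_step(step, run_metadata)
+    # for the profiled steps (see benchmark_one_step) ...
+    generate_tfprof_profile(profiler, '/tmp/benchmark.tfprof')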
+ """ + profile_proto = profiler.serialize_to_string() + log_fn('Dumping ProfileProto to %s' % tfprof_file) + with gfile.Open(tfprof_file, 'wb') as f: + f.write(profile_proto) + + # Print out the execution times of the top operations. Note this + # information can also be obtained with the dumped ProfileProto, but + # printing it means tfprof doesn't have to be used if all the user wants + # is the top ops. + options = tf.profiler.ProfileOptionBuilder.time_and_memory() + options['max_depth'] = _NUM_OPS_TO_PRINT + options['order_by'] = 'accelerator_micros' + profiler.profile_operations(options) + + +class BenchmarkCNN(object): + """Class for benchmarking a cnn network.""" + + def __init__(self, params, dataset=None, model=None): + """Initialize BenchmarkCNN. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + dataset: If not None, the dataset to use. Otherwise, params is used to + obtain the dataset. + model: If not None, the model to use. Otherwise, params is used to obtain + the model. + Raises: + ValueError: Unsupported params settings. + """ + mlperf.logger.log(key=mlperf.tags.RUN_START) + self.params = params + if params.eval: + self._doing_eval = True + else: + # Note self._doing_eval can later switch to True in self._do_eval() if + # self.params.eval_during_training_* is specified. + self._doing_eval = False + self.dataset = dataset or datasets.create_dataset(self.params.data_dir, + self.params.data_name) + self.model = model or model_config.get_model_config( + self.params.model, self.dataset, self.params) + self.trace_filename = self.params.trace_file + self.rewriter_config = self.params.rewriter_config + autotune_threshold = self.params.autotune_threshold if ( + self.params.autotune_threshold) else 1 + min_autotune_warmup = 5 * autotune_threshold * autotune_threshold + self.num_warmup_batches = self.params.num_warmup_batches if ( + self.params.num_warmup_batches is not None) else max( + 10, min_autotune_warmup) + self.graph_file = self.params.graph_file + self.resize_method = self.params.resize_method + self.sync_queue_counter = 0 + self.num_gpus = self.params.num_gpus + if self.params.gpu_indices: + self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')] + else: + self.gpu_indices = [x for x in range(self.num_gpus)] + + if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and + not self.params.mkl): + raise ValueError('device=cpu requires that data_format=NHWC') + + if ((self.params.num_epochs_per_decay or + self.params.learning_rate_decay_factor) and + not (self.params.init_learning_rate is not None and + self.params.num_epochs_per_decay + and self.params.learning_rate_decay_factor)): + raise ValueError('If one of num_epochs_per_decay or ' + 'learning_rate_decay_factor is set, both must be set' + 'and learning_rate must be set') + if (self.params.minimum_learning_rate and + not (self.params.init_learning_rate is not None and + self.params.num_epochs_per_decay and + self.params.learning_rate_decay_factor)): + raise ValueError('minimum_learning_rate requires learning_rate,' + 'num_epochs_per_decay, and ' + 'learning_rate_decay_factor to be set') + + if (self.params.use_fp16 and self.params.fp16_vars and + 'replicated' in self.params.variable_update and + self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec): + raise ValueError('fp16 variables are not supported with NCCL') + if (self.params.use_fp16 and self.params.fp16_vars and + self.params.gradient_repacking): + raise 
ValueError('--fp16_vars cannot be used with --gradient_repacking') + + if self.params.variable_update == 'horovod' and self.params.num_gpus > 1: + raise ValueError('Horovod benchmarks require num_gpus=1 on each worker') + + if self.params.variable_update == 'horovod' and self.params.job_name: + raise ValueError('job_name should not be specified for Horovod.') + + if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale: + if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec: + raise ValueError('Automatic loss scaling is not supported with NCCL.') + if self.params.variable_update not in ('parameter_server', 'replicated', + 'independent'): + raise ValueError('Automatic loss scaling is not supported with ' + 'variable_update=%s.' % self.params.variable_update) + if self.params.staged_vars: + raise ValueError('Automatic loss scaling is not supported with' + 'staged_vars.') + + if (self.params.debugger is not None and self.params.debugger != 'cli' and + ':' not in self.params.debugger): + raise ValueError('--debugger must be "cli" or in the form ' + 'host:port') + + if self.params.hierarchical_copy and self.params.num_gpus <= 1: + raise ValueError('--hierarchical_copy requires --num_gpus to be greater ' + 'than 1') + + if params.save_model_secs and params.save_model_steps: + raise ValueError('At most one of --save_model_secs and ' + '--save_model_steps can be specified') + + eval_during_training_flags = list(map(bool, [ + params.eval_during_training_every_n_steps, + params.eval_during_training_every_n_epochs, + params.eval_during_training_at_specified_steps, + params.eval_during_training_at_specified_epochs, + ])) + + if eval_during_training_flags.count(True) > 1: + raise ValueError('At most one flag with --eval_during_training_* prefix ' + 'must be specified.') + + eval_during_training_enabled = any(eval_during_training_flags) + + if eval_during_training_enabled: + if params.eval: + raise ValueError('At most one of --eval and --eval_during_training_* ' + 'must be specified') + if params.forward_only: + raise ValueError('At most one of --forward_only and ' + '--eval_during_training_* must be specified') + if params.job_name: + raise ValueError('--eval_during_training_* is not yet supported in ' + 'distributed mode.') + if params.staged_vars: + raise ValueError('--eval_during_training_* is not currently compatible ' + 'with --staged_vars') + + if params.stop_at_top_1_accuracy and not eval_during_training_enabled: + raise ValueError('--stop_at_top_1_accuracy is only supported with ' + '--eval_during_training_*') + if params.collect_eval_results_async and params.model != 'ssd300': + raise ValueError('--collect_eval_results_async only works with ssd300 ' + 'model currently.') + if self.params.forward_only and self.params.freeze_when_forward_only: + if self.params.train_dir is not None: + raise ValueError('In forward_only mode, when --freeze_when_forward_only' + ' is True, --train_dir should not be specified') + if self.params.data_dir and not self.params.datasets_use_prefetch: + raise ValueError('In forward_only mode, when --freeze_when_forward_only' + ' is True and --data_dir is set, ' + '--datasets_use_prefetch should be set to True') + if self.params.job_name: + raise ValueError('In forward_only mode, when --freeze_when_forward_only' + ' is True, --job_name should not be specified and ' + 'distributed running is not supported') + self.forward_only_and_freeze = True + else: + self.forward_only_and_freeze = False + if self.params.trt_mode: + raise 
ValueError('--trt_mode should not be specified if one of ' + '--forward_only and --freeze_when_forward_only is set ' + 'to False') + + self.mode = get_mode_from_params(self.params) + + # Use the batch size from the command line if specified, otherwise use the + # model's default batch size. Scale the benchmark's batch size by the + # number of GPUs. + if self.params.batch_size > 0: + self.model.set_batch_size(self.params.batch_size) + self.batch_size = self.model.get_batch_size() * self.num_gpus + if self.mode in (constants.BenchmarkMode.TRAIN, + constants.BenchmarkMode.TRAIN_AND_EVAL): + self.train_batch_size = self.batch_size + else: + self.train_batch_size = None + if self.mode in (constants.BenchmarkMode.EVAL, + constants.BenchmarkMode.TRAIN_AND_EVAL): + if self.params.eval_batch_size > 0: + self.eval_batch_size = self.params.eval_batch_size * self.num_gpus + else: + self.eval_batch_size = self.batch_size + else: + self.eval_batch_size = None + self.batch_group_size = self.params.batch_group_size + self.enable_auto_loss_scale = ( + self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale) + self.loss_scale = None + self.loss_scale_normal_steps = None + + self.job_name = self.params.job_name # "" for local training + + # PS server is used for distributed jobs not using all-reduce. + use_ps_server = self.job_name and (self.params.variable_update != + 'distributed_all_reduce' and + self.params.variable_update != + 'collective_all_reduce') + # controller is used for distributed_all_reduce with > 1 worker. + use_controller = ( + self.params.variable_update == 'distributed_all_reduce' and + self.job_name) + if use_controller and not params.controller_host: + raise ValueError('When variable_update==distributed_all_reduce ' + 'controller_host must also be specified.') + # collective_all_reduce doesn't need a controller or ps + self.distributed_collective = ( + self.params.variable_update == 'collective_all_reduce' and + self.job_name) + + self.local_parameter_device_flag = self.params.local_parameter_device + if self.job_name: + self.task_index = self.params.task_index + self.cluster_manager = platforms_util.get_cluster_manager( + params, create_config_proto(params)) + assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager) + + worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index + if use_ps_server: + self.param_server_device = tf.train.replica_device_setter( + worker_device=worker_prefix + '/cpu:0', + cluster=self.cluster_manager.get_cluster_spec()) + # This device on which the queues for managing synchronization between + # servers should be stored. 
+ self.sync_queue_devices = [ + '/job:ps/replica:0/task:%s/cpu:0' % i + for i in range(self.cluster_manager.num_ps()) + ] + else: + self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0'] + else: + self.task_index = 0 + self.cluster_manager = None + worker_prefix = '' + self.param_server_device = '/%s:0' % self.params.local_parameter_device + self.sync_queue_devices = [self.param_server_device] + + if self.cluster_manager: + self.num_workers = self.cluster_manager.num_workers() + elif self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + self.num_workers = hvd.size() + else: + self.num_workers = 1 + self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0 + + if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl': + raise ValueError('--all_reduce_spec=nccl is invalid in a ' + 'multi-worker job') + + # Device to use for ops that need to always run on the local worker's CPU. + self.cpu_device = '%s/cpu:0' % worker_prefix + + # Device to use for ops that need to always run on the local worker's + # compute device, and never on a parameter server device. + self.raw_devices = [ + '%s/%s:%i' % (worker_prefix, self.params.device, i) + for i in xrange(self.num_gpus) + ] + + subset = 'validation' if params.eval else 'train' + self.num_batches, self.num_epochs = get_num_batches_and_epochs( + params, self.batch_size * self.num_workers, + self.dataset.num_examples_per_epoch(subset)) + if self.mode in (constants.BenchmarkMode.EVAL, + constants.BenchmarkMode.TRAIN_AND_EVAL): + # TODO(reedwm): Currently we do extra eval logic for num_eval_batches and + # the preprocessor. We should encapsulate this logic into a shared + # function or class. + if params.num_eval_batches is None and params.num_eval_epochs is None: + eval_params = self.params + else: + eval_params = self.params._replace( + num_batches=self.params.num_eval_batches, + num_epochs=self.params.num_eval_epochs) + self.num_eval_batches, self.num_eval_epochs = get_num_batches_and_epochs( + eval_params, self.eval_batch_size * self.num_workers, + self.dataset.num_examples_per_epoch('validation')) + else: + self.num_eval_batches, self.num_eval_epochs = None, None + + num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train') + if self.params.eval_during_training_every_n_epochs: + n_epochs = self.params.eval_during_training_every_n_epochs + self.eval_during_training_at_specified_steps = { + (int(e * num_train_examples_per_epoch + self.batch_size - 1) // + self.batch_size) + for e in np.arange(n_epochs, self.num_epochs, n_epochs)} + + if self.params.eval_during_training_at_specified_steps: + try: + self.eval_during_training_at_specified_steps = set(map( + int, self.params.eval_during_training_at_specified_steps)) + except ValueError: + raise ValueError('Param eval_during_training_at_specified_steps value ' + 'of %s cannot be converted to a list of integers.' 
% + (self.params.eval_during_training_at_specified_steps)) + + if self.params.eval_during_training_at_specified_epochs: + try: + n_epochs = list(map( + float, self.params.eval_during_training_at_specified_epochs)) + offset = n_epochs[0] - 1 + if offset.is_integer(): + offset = int(offset) + mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset) + self.eval_during_training_at_specified_steps = { + (int(e * num_train_examples_per_epoch + self.batch_size - 1) // + self.batch_size) + for e in n_epochs} + except ValueError: + raise ValueError('Param eval_during_training_at_specified_epochs value ' + 'of %s cannot be converted to a list of floats.' % + (self.params.eval_during_training_at_specified_epochs)) + + if params.eval_during_training_every_n_epochs: + offset = params.eval_during_training_every_n_epochs - 1 + if offset.is_integer(): + offset = int(offset) + mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset) + + if (self.params.staged_vars and + self.params.variable_update != 'parameter_server'): + raise ValueError('staged_vars for now is only supported with ' + 'variable_update=parameter_server') + + if self.params.variable_update == 'parameter_server': + if self.job_name: + if not self.params.staged_vars: + self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS( + self) + else: + self.variable_mgr = ( + variable_mgr.VariableMgrDistributedFetchFromStagedPS(self)) + else: + if not self.params.staged_vars: + self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self) + else: + self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS( + self) + elif self.params.variable_update == 'replicated': + if self.job_name: + raise ValueError('Invalid variable_update in distributed mode: %s' % + self.params.variable_update) + self.variable_mgr = variable_mgr.VariableMgrLocalReplicated( + self, self.params.all_reduce_spec, + self.params.agg_small_grads_max_bytes, + self.params.agg_small_grads_max_group, + self.params.allreduce_merge_scope) + elif self.params.variable_update == 'distributed_all_reduce': + assert self.params.cross_replica_sync + self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce( + self, self.params.all_reduce_spec, + ('worker' if self.num_workers > 1 else 'localhost'), + self.num_workers, self.params.agg_small_grads_max_bytes, + self.params.agg_small_grads_max_group, + self.params.allreduce_merge_scope) + elif self.params.variable_update == 'collective_all_reduce': + assert self.params.cross_replica_sync + self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce( + self, self.params.all_reduce_spec, + self.num_workers, self.num_gpus, self.task_index, + self.params.allreduce_merge_scope) + elif self.params.variable_update == 'distributed_replicated': + assert self.params.cross_replica_sync + if not self.job_name: + raise ValueError('Invalid variable_update in local mode: %s' % + self.params.variable_update) + self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self) + elif self.params.variable_update in ('independent', 'horovod'): + if self.job_name: + raise ValueError('Invalid variable_update in distributed mode: %s' % + self.params.variable_update) + self.variable_mgr = variable_mgr.VariableMgrIndependent(self) + else: + raise ValueError( + 'Invalid variable_update: %s' % self.params.variable_update) + + # Device to use for running on the local worker's compute device, but + # with variables assigned to parameter server devices. 
+ self.devices = self.variable_mgr.get_devices() + if self.job_name: + if use_ps_server: + self.global_step_device = self.param_server_device + elif self.params.variable_update == 'collective_all_reduce': + self.global_step_device = self.cpu_device + else: + self.global_step_device = '/job:worker/replica:0/task:0/cpu:0' + else: + self.global_step_device = self.cpu_device + + self.input_preprocessor = None + self.eval_input_preprocessor = None + if not self.dataset.use_synthetic_gpu_inputs(): + if not self.params.eval: + self.input_preprocessor = self.get_input_preprocessor() + if self.mode in (constants.BenchmarkMode.EVAL, + constants.BenchmarkMode.TRAIN_AND_EVAL): + with self._do_eval(): + self.eval_input_preprocessor = self.get_input_preprocessor() + self.datasets_use_prefetch = ( + self.params.datasets_use_prefetch and + # TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the + # CPU. + self.params.device.lower() != 'cpu' and + self.input_preprocessor and + self.input_preprocessor.supports_datasets()) + self.init_global_step = 0 + + self._config_benchmark_logger() + + if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL: + # Remove "eval" from params so it is not accidentally used. Since eval can + # still occur despite params.eval being False, params.eval should never + # be used. We cannot yet remove this unconditionally, because the SSD + # model still uses params.eval, and hence does not work properly with + # --eval_during_training_*. + # TODO(b/116627045): We should also remove fields that have an eval + # equivalent, like num_batches and num_eval_batches. + self.params = remove_param_fields(self.params, {'eval'}) + + @contextlib.contextmanager + def _do_eval(self): + """Context manager to switches BenchmarkCNN to eval mode. + + Any evaluation code should be put under this context manager. This context + manager switches self._doing_eval to True. It also switches certain + attributes, like self.num_batches and self.num_epochs, to be the number of + batches and epochs for evaluation respectively + + Yields: + Nothing. + """ + # TODO(b/116627045): Find a more general way of switching attributes to the + # eval equivalents. + old_doing_eval = self._doing_eval + old_num_batches = self.num_batches + old_num_epochs = self.num_epochs + old_batch_size = self.batch_size + try: + self._doing_eval = True + self.num_batches = self.num_eval_batches + self.num_epochs = self.num_eval_epochs + self.batch_size = self.eval_batch_size + self.model.set_batch_size(self.eval_batch_size // self.num_gpus) + yield + finally: + self._doing_eval = old_doing_eval + self.num_batches = old_num_batches + self.num_epochs = old_num_epochs + self.batch_size = old_batch_size + self.model.set_batch_size(old_batch_size // self.num_gpus) + + def _config_benchmark_logger(self): + """Config the model garden benchmark logger.""" + model_benchmark_logger = None + if self.params.benchmark_log_dir is not None: + try: + from official.utils.logs import logger as models_logger # pylint: disable=g-import-not-at-top + except ImportError: + tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH ' + 'in order to use BenchmarkLogger. Configured ' + 'benchmark_log_dir: %s' + % self.params.benchmark_log_dir) + raise + model_benchmark_logger = models_logger.BenchmarkFileLogger( + self.params.benchmark_log_dir) + self.benchmark_logger = model_benchmark_logger + + # TODO(laigd): this changes the global device list which is used everywhere, + # consider refactoring it. 
+ def reset_devices_for_task(self, task_num, is_local=False): + """Used to imitate another task when building a distributed graph.""" + worker_prefix = ('/job:localhost' if is_local else + '/job:worker/replica:0/task:%s' % task_num) + self.cpu_device = '%s/cpu:0' % worker_prefix + self.raw_devices = [ + '%s/%s:%i' % (worker_prefix, self.params.device, i) + for i in xrange(self.num_gpus) + ] + self.devices = self.variable_mgr.get_devices() + + def raw_devices_across_tasks(self, is_local=False): + """Returns list of raw device names across all tasks.""" + if is_local: + assert self.num_workers == 1 + return self.raw_devices + else: + return [ + 'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i) + for t in xrange(self.num_workers) + for i in xrange(self.num_gpus) + ] + + def print_info(self): + """Print basic information.""" + benchmark_info = self._get_params_info() + log_fn('Model: %s' % self.model.get_model_name()) + log_fn('Dataset: %s' % benchmark_info['dataset_name']) + log_fn('Mode: %s' % self.mode) + log_fn('SingleSess: %s' % benchmark_info['single_session']) + log_fn('Batch size: %s global' % (self.batch_size * self.num_workers)) + log_fn(' %s per device' % (self.batch_size // + len(self.raw_devices))) + if self.batch_group_size > 1: + log_fn(' %d batches per prepocessing group' % + self.batch_group_size) + log_fn('Num batches: %d' % self.num_batches) + log_fn('Num epochs: %.2f' % self.num_epochs) + log_fn('Devices: %s' % benchmark_info['device_list']) + log_fn('NUMA bind: %s' % self.params.use_numa_affinity) + log_fn('Data format: %s' % self.params.data_format) + if self.rewriter_config: + log_fn('RewriterConfig: %s' % self.rewriter_config) + log_fn('Optimizer: %s' % self.params.optimizer) + log_fn('Variables: %s' % self.params.variable_update) + if (self.params.variable_update == 'replicated' or + self.params.variable_update == 'distributed_all_reduce' + or self.params.variable_update == 'collective_all_reduce'): + log_fn('AllReduce: %s' % self.params.all_reduce_spec) + if self.job_name: + log_fn('Sync: %s' % self.params.cross_replica_sync) + if self.params.staged_vars: + log_fn('Staged vars: %s' % self.params.staged_vars) + if self.params.variable_update == 'horovod' and self.params.horovod_device: + log_fn('Horovod on: %s' % self.params.horovod_device) + log_fn('==========') + + def _get_params_info(self): + """Get the common parameters info for the benchmark run. + + Returns: + A dict of processed parameters. + """ + dataset_name = self.dataset.name + if self.dataset.use_synthetic_gpu_inputs(): + dataset_name += ' (synthetic)' + single_session = self.params.variable_update == 'distributed_all_reduce' + if single_session: + device_list = self.raw_devices_across_tasks() + elif self.params.variable_update == 'horovod': + device_list = ['horovod/%s:%d' % (self.params.device, idx) + for idx in range(self.num_workers)] + else: + device_list = self.raw_devices + return { + 'dataset_name': dataset_name, + 'single_session': single_session, + 'device_list': device_list,} + + def _log_benchmark_run(self): + """Log the benchmark info to the logger. + + The info logged here should be similar to print_info(), but in a structured + JSON format. 
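+
+    This is a no-op unless --benchmark_log_dir was set, since the benchmark
+    logger is only created by _config_benchmark_logger in that case.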
+ """ + if self.benchmark_logger: + benchmark_info = self._get_params_info() + + run_param = { + 'model': self.model.get_model_name(), + 'dataset': benchmark_info['dataset_name'], + 'mode': self.mode, + 'single_sess': benchmark_info['single_session'], + 'devices': benchmark_info['device_list'], + 'batch_size': self.batch_size, + 'batch_size_per_device': self.batch_size // len(self.raw_devices), + 'num_batches': self.num_batches, + 'num_epochs': self.num_epochs, + 'data_format': self.params.data_format, + 'rewrite_config': self.rewriter_config, + 'optimizer': self.params.optimizer, + 'session_config': create_config_proto(self.params), + } + # TODO(scottzhu): tf_cnn_benchmark might execute several times with + # different param setting on the same box. This will cause the run file to + # only contain the latest info. The benchmark_log_dir should be updated + # for every new run. + self.benchmark_logger.log_run_info( + self.model.get_model_name(), benchmark_info['dataset_name'], + run_param, test_id=self.params.benchmark_test_id) + + def run(self): + """Run the benchmark task assigned to this process. + + Returns: + Dictionary of statistics for training or eval. + Raises: + ValueError: unrecognized job name. + """ + if self.params.job_name == 'ps': + log_fn('Running parameter server %s' % self.task_index) + self.cluster_manager.join_server() + return {} + + # For distributed_all_reduce with multiple workers, drive + # from a separate controller process. + if self.params.variable_update == 'distributed_all_reduce': + if self.params.job_name == 'worker': + log_fn('Starting worker %s' % self.task_index) + self.cluster_manager.join_server() + return + elif self.params.job_name and self.params.job_name != 'controller': + raise ValueError('unrecognized job name: %s' % self.params.job_name) + + self._log_benchmark_run() + if self._doing_eval: + with tf.Graph().as_default(): + # TODO(laigd): freeze the graph in eval mode. + return self._run_eval() + else: + return self._benchmark_train() + + def _run_eval(self): + """Evaluate a model every self.params.eval_interval_secs. + + Returns: + Dictionary containing eval statistics. Currently returns an empty + dictionary. + + Raises: + ValueError: If self.params.train_dir is unspecified. + """ + if self.params.train_dir is None: + raise ValueError('Trained model directory not specified') + graph_info = self._build_eval_graph() + saver = tf.train.Saver(self.variable_mgr.savable_variables()) + summary_writer = tf.summary.FileWriter(self.params.eval_dir, + tf.get_default_graph()) + target = '' + # TODO(huangyp): Check if checkpoints haven't updated for hours and abort. + while True: + with tf.Session( + target=target, config=create_config_proto(self.params)) as sess: + image_producer = None + try: + global_step = load_checkpoint(saver, sess, self.params.train_dir) + image_producer = self._initialize_eval_graph( + graph_info.enqueue_ops, graph_info.input_producer_op, + graph_info.local_var_init_op_group, sess) + except CheckpointNotFoundException: + log_fn('Checkpoint not found in %s' % self.params.train_dir) + else: # Only executes if an exception was not thrown + self._eval_once(sess, summary_writer, graph_info.fetches, + graph_info.summary_op, image_producer, global_step) + if image_producer is not None: + image_producer.done() + if self.params.eval_interval_secs <= 0: + break + time.sleep(self.params.eval_interval_secs) + return {} + + def _build_eval_graph(self, scope_name=None): + """Build the evaluation graph. 
+ + Args: + scope_name: String to filter what summaries are collected. Only summary + ops whose name contains `scope_name` will be added, which is useful for + only including evaluation ops. + + Returns: + A GraphInfo named_tuple containing various useful ops and tensors of the + evaluation grpah. + """ + with self._do_eval(): + input_producer_op, enqueue_ops, fetches = self._build_model() + local_var_init_op = tf.local_variables_initializer() + table_init_ops = tf.tables_initializer() + variable_mgr_init_ops = [local_var_init_op] + if table_init_ops: + variable_mgr_init_ops.extend([table_init_ops]) + with tf.control_dependencies([local_var_init_op]): + variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops()) + local_var_init_op_group = tf.group(*variable_mgr_init_ops) + + summary_op = tf.summary.merge_all(scope=scope_name) + # The eval graph has no execution barrier because it doesn't run in + # distributed mode. + execution_barrier = None + # We do not use the global step during evaluation. + global_step = None + return GraphInfo(input_producer_op, enqueue_ops, fetches, + execution_barrier, global_step, local_var_init_op_group, + summary_op) + + # TODO(reedwm): For consistency, we should have a similar + # "_initialize_train_graph" function. They can likely be the same function. + def _initialize_eval_graph(self, enqueue_ops, input_producer_op, + local_var_init_op_group, sess): + """Initializes the evaluation graph. + + Args: + enqueue_ops: Ops that adds the preprocessed images to the staging areas. + input_producer_op: Op that produce the input batches (before + preprocessing). + local_var_init_op_group: Group of ops that perform per-device + initialization work. + sess: The session to initialize the eval graph with. + + Returns: + An ImageProducer, or None if an ImageProducer isn't being used. + """ + with self._do_eval(): + if local_var_init_op_group is not None: + # We might reinitialize local variables if they were already initialized + # during training. This is OK. + sess.run(local_var_init_op_group) + if self.dataset.queue_runner_required(): + tf.train.start_queue_runners(sess=sess) + image_producer = None + if input_producer_op is not None: + image_producer = cnn_util.ImageProducer( + sess, input_producer_op, self.batch_group_size, + self.params.use_python32_barrier) + image_producer.start() + if enqueue_ops: + for i in xrange(len(enqueue_ops)): + sess.run(enqueue_ops[:(i + 1)]) + if image_producer is not None: + image_producer.notify_image_consumption() + return image_producer + + def _eval_once(self, sess, summary_writer, fetches, summary_op, + image_producer, global_step): + """Evaluate the model using the validation dataset.""" + with self._do_eval(): + mlperf.logger.log_eval_epoch( + mlperf.tags.EVAL_START, global_step, self.batch_size) + loop_start_time = start_time = time.time() + # TODO(laigd): refactor the part to compute/report the accuracy. Currently + # it only works for image models. + top_1_accuracy_sum = 0.0 + top_5_accuracy_sum = 0.0 + total_eval_count = self.num_batches * self.batch_size + pred_classes = [] + for step in xrange(self.num_batches): + if (summary_writer and self.params.save_summaries_steps > 0 and + (step + 1) % self.params.save_summaries_steps == 0): + results, summary_str = sess.run([fetches, summary_op]) + summary_writer.add_summary(summary_str) + else: + results = sess.run(fetches) + # Make global_step available in results for postprocessing. 
+ results['global_step'] = global_step + results = self.model.postprocess(results) + pred_classes.append(results['all_logits']) + top_1_accuracy_sum += results['top_1_accuracy'] + top_5_accuracy_sum += results['top_5_accuracy'] + if (step + 1) % self.params.display_every == 0: + duration = time.time() - start_time + examples_per_sec = ( + self.batch_size * self.params.display_every / duration) + log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec)) + start_time = time.time() + if image_producer is not None: + image_producer.notify_image_consumption() + pred_classes = np.squeeze(np.array(pred_classes)) + save_postfix = 'nv' if 'nv' in self.params.save_dir else 'bi' + np.save('{}/pred_classes_{}_{}.npy'.format(self.params.save_dir, self.params.model, save_postfix), pred_classes) + loop_end_time = time.time() + accuracy_at_1 = top_1_accuracy_sum / self.num_batches + accuracy_at_5 = top_5_accuracy_sum / self.num_batches + summary = tf.Summary() + summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1) + summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5) + for result_key, result_value in results.items(): + if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX): + prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX) + summary.value.add(tag='eval/' + result_key[prefix_len:], + simple_value=result_value) + if summary_writer: + summary_writer.add_summary(summary, global_step) + log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' % + (accuracy_at_1, accuracy_at_5, total_eval_count)) + elapsed_time = loop_end_time - loop_start_time + images_per_sec = (self.num_batches * self.batch_size / elapsed_time) + if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL: + # Note that we compute the top 1 accuracy and top 5 accuracy for each + # batch, which will have a slight performance impact. + log_fn('-' * 64) + log_fn('total images/sec: %.2f' % images_per_sec) + log_fn('-' * 64) + if self.benchmark_logger: + eval_result = { + 'eval_top_1_accuracy', accuracy_at_1, + 'eval_top_5_accuracy', accuracy_at_5, + 'eval_average_examples_per_sec', images_per_sec, + tf.GraphKeys.GLOBAL_STEP, global_step, + } + self.benchmark_logger.log_evaluation_result(eval_result) + mlperf.logger.log_eval_epoch( + mlperf.tags.EVAL_STOP, global_step, self.batch_size) + mlperf.logger.log(key=mlperf.tags.EVAL_SIZE, + value=self.num_batches * self.batch_size) + if self.params.model != 'ssd300': # ssd300 logs eval accuracy elsewhere. + mlperf.logger.log_eval_accuracy( + accuracy_at_1, global_step, self.train_batch_size, + examples_per_epoch=self.dataset.num_examples_per_epoch('train')) + if self.params.stop_at_top_1_accuracy: + mlperf.logger.log(key=mlperf.tags.EVAL_TARGET, + value=self.params.stop_at_top_1_accuracy) + return accuracy_at_1, accuracy_at_5 + + def _benchmark_train(self): + """Run cnn in benchmark mode. Skip the backward pass if forward_only is on. + + Returns: + Dictionary containing training statistics (num_workers, num_steps, + average_wall_time, images_per_sec). 
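+
+    Raises:
+      RuntimeError: If the input pipeline raises tf.errors.OutOfRangeError,
+        which is re-raised as a RuntimeError so that the Supervisor's
+        managed_session does not silently swallow it (see _benchmark_graph).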
+ """ + graph = tf.Graph() + with graph.as_default(): + build_result = self._build_graph() + if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL: + with self.variable_mgr.reuse_variables(): + with tf.name_scope('Evaluation') as ns: + eval_build_results = self._build_eval_graph(ns) + else: + eval_build_results = None + (graph, result_to_benchmark) = self._preprocess_graph(graph, build_result) + with graph.as_default(): + return self._benchmark_graph(result_to_benchmark, eval_build_results) + + GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs' + + def _unfreezable_local_variables(self, graph): + """Get the local variables that we don't want to freeze.""" + return graph.get_collection( + tf.GraphKeys.LOCAL_VARIABLES, + # We don't freeze the gpu_cached_images local variable so it won't get + # constant folded with ops which process the input. + scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME) + + def _build_graph(self): + """Build the graph. + + Returns: + A namedtuple containing the ops/tensors that required by + _benchmark_graph(). + """ + if self.params.variable_update == 'distributed_all_reduce': + self.single_session = True + (input_producer_op, enqueue_ops, fetches) = ( + self._build_model_single_session()) + else: + self.single_session = False + (input_producer_op, enqueue_ops, fetches) = self._build_model() + fetches_list = nest.flatten(list(fetches.values())) + main_fetch_group = tf.group(*fetches_list, name='main_fetch_group') + execution_barrier = None + if (not self.single_session and self.job_name and + not self.params.cross_replica_sync): + execution_barrier = self.add_sync_queues_and_barrier( + 'execution_barrier_', []) + + global_step = tf.train.get_global_step() + with tf.device(self.global_step_device), tf.name_scope('inc_global_step'): + with tf.control_dependencies([main_fetch_group]): + fetches['inc_global_step'] = global_step.assign_add(1) + + if ((not self.single_session) and (not self.distributed_collective) and + self.job_name and self.params.cross_replica_sync): + # Block all replicas until all replicas are ready for next step. + fetches['sync_queues'] = self.add_sync_queues_and_barrier( + 'sync_queues_step_end_', [main_fetch_group]) + + # Skips the init ops for freezable local variables in forward_only mode so + # we can remove all the assign ops when converting variables to constants. + with tf.name_scope('local_variable_initialization'): + if self.forward_only_and_freeze: + local_var_init_op = tf.variables_initializer( + self._unfreezable_local_variables(tf.get_default_graph())) + else: + local_var_init_op = tf.local_variables_initializer() + table_init_ops = tf.tables_initializer() + + variable_manager_init_ops = [local_var_init_op] + if table_init_ops: + variable_manager_init_ops.extend([table_init_ops]) + if not self.forward_only_and_freeze: + with tf.control_dependencies([local_var_init_op]): + variable_manager_init_ops.extend(self.variable_mgr.get_post_init_ops()) + if ((not self.single_session) and (not self.distributed_collective) and + self.job_name and self.params.cross_replica_sync): + # Ensure all workers execute variable_manager_init_ops before they start + # executing the model. 
+ variable_manager_init_ops.append( + self.add_sync_queues_and_barrier('init_ops_end_', + variable_manager_init_ops)) + local_var_init_op_group = tf.group(*variable_manager_init_ops, + name='local_var_init_op_group') + summary_op = tf.summary.merge_all() + + return GraphInfo( + input_producer_op=input_producer_op, + enqueue_ops=enqueue_ops, + fetches=fetches, + execution_barrier=execution_barrier, + global_step=global_step, + local_var_init_op_group=local_var_init_op_group, + summary_op=summary_op) + + def _benchmark_graph(self, graph_info, eval_graph_info): + """Benchmark the training graph. + + Args: + graph_info: the namedtuple returned by _build_graph() which + contains all necessary information to benchmark the graph, including + named tensors/ops list, fetches, etc. + eval_graph_info: Similar to graph_info but for the eval graph if + --eval_during_training_* is used. Otherwise, None. + Returns: + Dictionary containing training statistics (num_workers, num_steps, + average_wall_time, images_per_sec). + """ + log_fn('Initializing graph') + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + # First worker will be 'chief' - it will write summaries and + # save checkpoints. + is_chief = hvd.rank() == 0 + else: + is_chief = (not self.job_name or self.task_index == 0) + + summary_writer = None + if (is_chief and self.params.summary_verbosity and self.params.train_dir and + self.params.save_summaries_steps > 0): + summary_writer = tf.summary.FileWriter(self.params.train_dir, + tf.get_default_graph()) + + # We want to start the benchmark timer right after a image_producer barrier + # and avoids undesired waiting times on barriers. + if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) % + self.batch_group_size) != 0: + self.num_warmup_batches = int( + math.ceil( + (self.num_warmup_batches + len(graph_info.enqueue_ops) - 1.0) / + (self.batch_group_size)) * self.batch_group_size - + len(graph_info.enqueue_ops) + 1) + log_fn('Round up warm up steps to %d to match batch_group_size' % + self.num_warmup_batches) + assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) % + self.batch_group_size) == 0 + # We run the summaries in the same thread as the training operations by + # passing in None for summary_op to avoid a summary_thread being started. + # Running summaries and training operations in parallel could run out of + # GPU memory. + if is_chief and not self.forward_only_and_freeze: + saver = tf.train.Saver( + self.variable_mgr.savable_variables(), + save_relative_paths=True, + max_to_keep=self.params.max_ckpts_to_keep) + else: + saver = None + ready_for_local_init_op = None + if self.job_name and not (self.single_session or + self.distributed_collective): + # In distributed mode, we don't want to run local_var_init_op_group until + # the global variables are initialized, because local_var_init_op_group + # may use global variables (such as in distributed replicated mode). We + # don't set this in non-distributed mode, because in non-distributed mode, + # local_var_init_op_group may itself initialize global variables (such as + # in replicated mode). 
+ ready_for_local_init_op = tf.report_uninitialized_variables( + tf.global_variables()) + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + bcast_global_variables_op = hvd.broadcast_global_variables(0) + else: + bcast_global_variables_op = None + + if self.params.variable_update == 'collective_all_reduce': + # It doesn't matter what this collective_graph_key value is, + # so long as it's > 0 and the same at every worker. + init_run_options = tf.RunOptions() + init_run_options.experimental.collective_graph_key = 6 + else: + init_run_options = tf.RunOptions() + local_var_init_ops = [graph_info.local_var_init_op_group] + if eval_graph_info: + # `eval_graph_info.local_var_init_op_group` also includes some of the + # training initializer ops, since it's difficult to filter them out. + # Rerunning the training initializer ops is OK, but we add a control + # dependency since running two sets of training initializer ops at the + # same time can cause race conditions. + with tf.control_dependencies(local_var_init_ops): + local_var_init_ops.append(eval_graph_info.local_var_init_op_group) + sv = tf.train.Supervisor( + # For the purpose of Supervisor, all Horovod workers are 'chiefs', + # since we want session to be initialized symmetrically on all the + # workers. + is_chief=is_chief or (self.params.variable_update == 'horovod' + or self.distributed_collective), + # Log dir should be unset on non-chief workers to prevent Horovod + # workers from corrupting each other's checkpoints. + logdir=self.params.train_dir if is_chief else None, + ready_for_local_init_op=ready_for_local_init_op, + local_init_op=local_var_init_ops, + saver=saver, + global_step=graph_info.global_step, + summary_op=None, + save_model_secs=self.params.save_model_secs, + summary_writer=summary_writer, + local_init_run_options=init_run_options) + + profiler = tf.profiler.Profiler() if self.params.tfprof_file else None + if self.graph_file is not None: + path, filename = os.path.split(self.graph_file) + as_text = filename.endswith('txt') + log_fn('Writing GraphDef as %s to %s' % ( # pyformat break + 'text' if as_text else 'binary', self.graph_file)) + tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True), + path, filename, as_text) + + start_standard_services = ( + self.params.train_dir or + self.dataset.queue_runner_required()) + target = self.cluster_manager.get_target() if self.cluster_manager else '' + with sv.managed_session( + master=target, + config=create_config_proto(self.params), + start_standard_services=start_standard_services) as sess: + # Anything that can potentially raise an OutOfRangeError with 'sess' MUST + # be under this try block. The managed_session() context manager silently + # ignores OutOfRangeError, so we must catch them and wrap them with + # a different exception type so that they can be propagated up to the + # caller. + try: + stats = self.benchmark_with_session( + sess, sv, graph_info, eval_graph_info, bcast_global_variables_op, + is_chief, summary_writer, profiler) + except tf.errors.OutOfRangeError: + raise RuntimeError( + 'Received OutOfRangeError. Wrapping in Runtime error to avoid ' + 'Supervisor from suppressing the error. 
Original OutOfRangeError ' + 'with traceback:\n' + traceback.format_exc()) + + sv.stop() + if profiler: + generate_tfprof_profile(profiler, self.params.tfprof_file) + return stats + + def benchmark_with_session(self, sess, supervisor, graph_info, + eval_graph_info, bcast_global_variables_op, + is_chief, summary_writer, profiler): + """Benchmarks the graph with the given session. + + Args: + sess: The session to benchmark the graph with + supervisor: The Supervisor that created the session. + graph_info: the namedtuple returned by _build_graph() which + contains all necessary information to benchmark the graph, including + named tensors/ops list, fetches, etc. + eval_graph_info: Similar to graph_info but for the eval graph if + --eval_during_training_every_n_steps is used. Otherwise, None. + bcast_global_variables_op: If Horovod is used, the op to broadcast the + global variables to all the processes. None if Horovod is not used. + is_chief: True if this is the chief process. + summary_writer: The SummaryWriter used to write summaries, or None if + summaries are not used. + profiler: The tf.profiler.Profiler, or None if tfprof is not used. + + Returns: + Dictionary containing training statistics (num_workers, num_steps, + average_wall_time, images_per_sec). + """ + if self.params.backbone_model_path is not None: + self.model.load_backbone_model(sess, self.params.backbone_model_path) + if bcast_global_variables_op: + sess.run(bcast_global_variables_op) + image_producer = None + if graph_info.input_producer_op is not None: + image_producer = cnn_util.ImageProducer( + sess, graph_info.input_producer_op, self.batch_group_size, + self.params.use_python32_barrier) + image_producer.start() + if graph_info.enqueue_ops: + for i in xrange(len(graph_info.enqueue_ops)): + sess.run(graph_info.enqueue_ops[:(i + 1)]) + if image_producer is not None: + image_producer.notify_image_consumption() + self.init_global_step, = sess.run([graph_info.global_step]) + if self.job_name and not self.params.cross_replica_sync: + # TODO(zhengxq): Do we need to use a global step watcher at all? + global_step_watcher = GlobalStepWatcher( + sess, graph_info.global_step, + self.num_workers * self.num_warmup_batches + + self.init_global_step, + self.num_workers * (self.num_warmup_batches + self.num_batches) - 1) + global_step_watcher.start() + else: + global_step_watcher = None + eval_image_producer = None + if eval_graph_info: + # We pass local_var_init_op_group=None because the Supervisor already + # initialized local variables above. We need to have the Supervisor + # initialize the local variables, because otherwise it throws an error + # complaining that not all variables were initialized. + eval_image_producer = self._initialize_eval_graph( + eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op, + local_var_init_op_group=None, sess=sess) + step_train_times = [] + log_fn('Running warm up') + local_step = -1 * self.num_warmup_batches + if self.single_session: + # In single session mode, each step, the global_step is incremented by + # 1. In non-single session mode, each step, the global_step is + # incremented once per worker. This means we need to divide + # init_global_step by num_workers only in non-single session mode. 
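+      # Illustrative arithmetic (hypothetical numbers): resuming from a
+      # checkpoint with init_global_step=100, num_batches=110 and 2 workers,
+      # each worker has already run 100 // 2 = 50 local steps, so
+      # end_local_step = 110 - 50 = 60. In single-session mode the global step
+      # already counts local steps, so end_local_step would be 110 - 100 = 10.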
+ end_local_step = self.num_batches - self.init_global_step + else: + end_local_step = self.num_batches - (self.init_global_step // + self.num_workers) + if not global_step_watcher: + # In cross-replica sync mode, all workers must run the same number of + # local steps, or else the workers running the extra step will block. + done_fn = lambda: local_step >= end_local_step + else: + done_fn = global_step_watcher.done + if self.params.debugger is not None: + if self.params.debugger == 'cli': + log_fn('The CLI TensorFlow debugger will be used.') + sess = tf_debug.LocalCLIDebugWrapperSession(sess) + else: + log_fn('The TensorBoard debugger plugin will be used.') + sess = tf_debug.TensorBoardDebugWrapperSession(sess, + self.params.debugger) + mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP) + skip_final_eval = False + accuracy_at_1 = None + accuracy_at_5 = None + last_eval_step = local_step + loop_start_time = time.time() + last_average_loss = None + while not done_fn(): + if local_step == 0: + log_fn('Done warm up') + if graph_info.execution_barrier: + log_fn('Waiting for other replicas to finish warm up') + sess.run([graph_info.execution_barrier]) + + # TODO(laigd): rename 'Img' to maybe 'Input'. + header_str = ('Step\tImg/sec\t' + + self.params.loss_type_to_report.replace('/', ' ')) + if self.params.print_training_accuracy or self.params.forward_only: + # TODO(laigd): use the actual accuracy op names of the model. + header_str += '\ttop_1_accuracy\ttop_5_accuracy' + log_fn(header_str) + assert len(step_train_times) == self.num_warmup_batches + # reset times to ignore warm up batch + step_train_times = [] + loop_start_time = time.time() + if (summary_writer and + (local_step + 1) % self.params.save_summaries_steps == 0): + fetch_summary = graph_info.summary_op + else: + fetch_summary = None + collective_graph_key = 7 if ( + self.params.variable_update == 'collective_all_reduce') else 0 + (summary_str, last_average_loss) = benchmark_one_step( + sess, graph_info.fetches, local_step, + self.batch_size * (self.num_workers + if self.single_session else 1), step_train_times, + self.trace_filename, self.params.partitioned_graph_file_prefix, + profiler, image_producer, self.params, fetch_summary, + benchmark_logger=self.benchmark_logger, + collective_graph_key=collective_graph_key) + if summary_str is not None and is_chief: + supervisor.summary_computed(sess, summary_str) + local_step += 1 + if (self.params.save_model_steps and + local_step % self.params.save_model_steps == 0 and + local_step > 0 and + is_chief): + supervisor.saver.save(sess, supervisor.save_path, + supervisor.global_step) + if (eval_graph_info and local_step > 0 and not done_fn() and + self._should_eval_during_training(local_step)): + python_global_step = sess.run(graph_info.global_step) + num_steps_since_last_eval = local_step - last_eval_step + # The INPUT_SIZE tag value might not match the + # PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples + # run, which is INPUT_SIZE, is rounded up to the nearest multiple of + # self.batch_size. 
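+        # Illustrative example (single worker, evaluating once per epoch):
+        # with batch_size=256 and ImageNet's 1,281,167 training images, an
+        # epoch takes ceil(1281167 / 256) = 5005 steps, so INPUT_SIZE is
+        # logged as 5005 * 256 = 1,281,280 even though
+        # PREPROC_NUM_TRAIN_EXAMPLES is 1,281,167.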
+ mlperf.logger.log( + key=mlperf.tags.INPUT_SIZE, + value=num_steps_since_last_eval * self.batch_size) + log_fn('Running evaluation at global_step {}'.format( + python_global_step)) + accuracy_at_1, accuracy_at_5 = self._eval_once( + sess, summary_writer, eval_graph_info.fetches, + eval_graph_info.summary_op, eval_image_producer, + python_global_step) + last_eval_step = local_step + if (self.params.stop_at_top_1_accuracy and + accuracy_at_1 >= self.params.stop_at_top_1_accuracy): + log_fn('Stopping, as eval accuracy at least %s was reached' % + self.params.stop_at_top_1_accuracy) + skip_final_eval = True + break + else: + log_fn('Resuming training') + if eval_graph_info and self.model.reached_target(): + log_fn('Stopping, as the model indicates its custom goal was reached') + skip_final_eval = True + break + loop_end_time = time.time() + # Waits for the global step to be done, regardless of done_fn. + if global_step_watcher: + while not global_step_watcher.done(): + time.sleep(.25) + if not global_step_watcher: + elapsed_time = loop_end_time - loop_start_time + average_wall_time = elapsed_time / local_step if local_step > 0 else 0 + images_per_sec = (self.num_workers * local_step * self.batch_size / + elapsed_time) + num_steps = local_step * self.num_workers + else: + # NOTE: Each worker independently increases the global step. So, + # num_steps will be the sum of the local_steps from each worker. + num_steps = global_step_watcher.num_steps() + elapsed_time = global_step_watcher.elapsed_time() + average_wall_time = (elapsed_time * self.num_workers / num_steps + if num_steps > 0 else 0) + images_per_sec = num_steps * self.batch_size / elapsed_time + + # We skip printing images/sec if --eval_during_training_* is specified, + # because we are both processing training and evaluation images, so a + # singular "images/sec" value is meaningless. + if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL: + log_fn('-' * 64) + # TODO(laigd): rename 'images' to maybe 'inputs'. + log_fn('total images/sec: %.2f' % images_per_sec) + log_fn('-' * 64) + else: + log_fn('Done with training') + num_steps_since_last_eval = local_step - last_eval_step + mlperf.logger.log( + key=mlperf.tags.INPUT_SIZE, + value=num_steps_since_last_eval * self.batch_size) + python_global_step = sess.run(graph_info.global_step) + if eval_graph_info and not skip_final_eval: + log_fn('Running final evaluation at global_step {}'.format( + python_global_step)) + accuracy_at_1, accuracy_at_5 = self._eval_once( + sess, summary_writer, eval_graph_info.fetches, + eval_graph_info.summary_op, eval_image_producer, python_global_step) + num_epochs_ran = (python_global_step * self.batch_size / + self.dataset.num_examples_per_epoch('train')) + mlperf.logger.log_train_epochs(num_epochs_ran) + if image_producer is not None: + image_producer.done() + if eval_image_producer is not None: + eval_image_producer.done() + if is_chief: + if self.benchmark_logger: + self.benchmark_logger.log_metric( + 'average_examples_per_sec', images_per_sec, global_step=num_steps) + + # Save the model checkpoint. + if self.params.train_dir is not None and is_chief: + checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt') + if not gfile.Exists(self.params.train_dir): + gfile.MakeDirs(self.params.train_dir) + supervisor.saver.save(sess, checkpoint_path, graph_info.global_step) + if graph_info.execution_barrier: + # Wait for other workers to reach the end, so this worker doesn't + # go away underneath them. 
+ sess.run([graph_info.execution_barrier]) + stats = { + 'num_workers': self.num_workers, + 'num_steps': num_steps, + 'average_wall_time': average_wall_time, + 'images_per_sec': images_per_sec + } + if last_average_loss is not None: + stats['last_average_loss'] = last_average_loss + if accuracy_at_1 is not None: + stats['top_1_accuracy'] = accuracy_at_1 + if accuracy_at_5 is not None: + stats['top_5_accuracy'] = accuracy_at_5 + + success = bool(self.model.reached_target() or + (accuracy_at_1 and self.params.stop_at_top_1_accuracy and + accuracy_at_1 >= self.params.stop_at_top_1_accuracy)) + mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success}) + mlperf.logger.log(key=mlperf.tags.RUN_FINAL) + return stats + + def _should_eval_during_training(self, step): + """Return True iff should run eval during training at current step.""" + + assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL + + if self.params.eval_during_training_every_n_steps: + return step % self.params.eval_during_training_every_n_steps == 0 + + # All other --eval_during_training_* flags are converted to step numbers + # at which the model should run evaluation during training. + return step in self.eval_during_training_at_specified_steps + + def _preprocess_graph(self, graph, graph_info): + """Preprocess the graph before executing. + + Depending on the params, it runs various preprocessing on the graph, + including freezing, TensorRT conversion, etc. + + Args: + graph: the graph to preprocess. + graph_info: the namedtuple returned by _build_graph() which + contains all necessary information to benchmark the graph, including + named tensors/ops list, fetches, etc. + + Returns: + The updated graph and graph_info with the ops/tensors/fetches updated + according to the imported graph. + """ + assert isinstance(graph_info.fetches, dict) + assert isinstance(graph_info.global_step, tf.Variable) + if not self.forward_only_and_freeze: + return (graph, graph_info) + + # Get the names of the ops that need to keep during conversion. + flattened_op_names = list( + set([ + v.name.split(':')[0] + for v in nest.flatten(graph_info) + if v is not None + ])) + # Get variables that we don't want to freeze. + # Only keep unfreezable variables in forward_only_and_freeze mode. + # TODO(laigd): consider making global_step a constant. + variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES} + variables_to_keep.update({ + local_variable: tf.GraphKeys.LOCAL_VARIABLES + for local_variable in self._unfreezable_local_variables(graph) + }) + + variable_initializers = [ + variable.initializer.name for variable in variables_to_keep] + output_node_names = ( + flattened_op_names + + # Add variable initializer and read ops to the output list, so + # convert_variables_to_constants() will keep them. + variable_initializers + + [variable.value().op.name for variable in variables_to_keep]) + graphdef = graph.as_graph_def(add_shapes=True) + + # Freeze the graph. + with graph.as_default(): + with tf.Session(config=create_config_proto(self.params)) as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + graphdef = graph_util.convert_variables_to_constants( + sess, + graphdef, + output_node_names, + variable_names_blacklist=[ + variable.op.name for variable in variables_to_keep + ]) + + # Run TensorRT conversion. 
+ if self.params.trt_mode: + # Import here instead of at top, because this will crash if TensorRT is + # not installed + from tensorflow.python.compiler.tensorrt import trt_convert # pylint: disable=g-import-not-at-top + # Avoid TF-TRT bridge from touching all variable initializer ops and their + # dependencies, since they can directly be fetched by sess.run()s that + # initialize the variables. + # pylint: disable=protected-access + name_to_input_name, _, _ = graph_util_impl._extract_graph_summary( + graphdef) + initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes( + variable_initializers, name_to_input_name) + # pylint: enable=protected-access + + graphdef = trt_convert.create_inference_graph( + graphdef, + outputs=output_node_names + list(initializer_subgraph_ops), + max_batch_size=self.model.get_batch_size(), + max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes, + precision_mode=self.params.trt_mode) + + # Creates a new graph as the default and import the converted graph back. + updated_graph = tf.Graph() + + def _get_tensors_or_ops(inputs): + """Gets the updated tensors or ops from 'updated_graph'.""" + + def _get_fn(element): + if element is None: + return None + if ':' in element.name: + return updated_graph.get_tensor_by_name(element.name) + return updated_graph.get_operation_by_name(element.name) + + if isinstance(inputs, (list, dict, tuple)): + return nest.map_structure(_get_fn, inputs) + else: + return _get_fn(inputs) + + with updated_graph.as_default(): + importer.import_graph_def(graph_def=graphdef, name='') + + # Update the variables + for variable in variables_to_keep: + updated_variable = tf.Variable.from_proto(variable.to_proto()) + tf.add_to_collection(variables_to_keep[variable], updated_variable) + if variable is graph_info.global_step: + updated_global_step = updated_variable + + updated_graph_info = GraphInfo( + input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op), + enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops), + execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier), + local_var_init_op_group=_get_tensors_or_ops( + graph_info.local_var_init_op_group), + fetches=_get_tensors_or_ops(graph_info.fetches), + global_step=updated_global_step, + summary_op=None) + return (updated_graph, updated_graph_info) + + def _build_input_processing(self, shift_ratio=0): + """"Build the image (pre)processing portion of the model graph. + + Args: + shift_ratio: shift_ratio for data_flow_ops.RecordInput. + + Returns: + An InputProcessingInfo containing all the input sources to the model. + """ + input_processing_info = InputProcessingInfo( + input_producer_op=None, + input_producer_stages=None, + multi_device_iterator_input=None) + + mlperf.logger.log(key=mlperf.tags.INPUT_ORDER) + if not self._doing_eval: + mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE, value=self.batch_size) + + # If using synthetic gpu inputs, do nothing on the cpu side. + if self.dataset.use_synthetic_gpu_inputs(): + assert not self.datasets_use_prefetch + return input_processing_info + + if self._doing_eval: + input_preprocessor = self.eval_input_preprocessor + mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES, + value=self.dataset.num_examples_per_epoch('validation')) + else: + input_preprocessor = self.input_preprocessor + mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES, + value=self.dataset.num_examples_per_epoch('train')) + + # Use prefetching mechanism provided by dataset input pipeline. 
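+    # The MultiDeviceIterator built below prefetches batches with tf.data and
+    # its get_next() returns one batch per device (indexed later by
+    # rel_device_num), so the StagingArea path further down is only needed
+    # when tf.data prefetching is disabled.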
+ if self.datasets_use_prefetch: + multi_device_iterator = ( + input_preprocessor.build_multi_device_iterator( + self.batch_size, len(self.devices), self.cpu_device, self.params, + self.raw_devices, self.dataset, self._doing_eval)) + return input_processing_info._replace( + multi_device_iterator_input=multi_device_iterator.get_next()) + + # Not using dataset prefetching. Use a staging area to mimic the prefetching + # behavior instead. + with tf.device(self.cpu_device): + if self._doing_eval: + subset = 'validation' + else: + subset = 'train' + input_list = input_preprocessor.minibatch( + self.dataset, + subset=subset, + params=self.params, + shift_ratio=shift_ratio) + + input_producer_op = [] + input_producer_stages = [] + for device_num in range(len(self.devices)): + staging_area = data_flow_ops.StagingArea( + [parts[0].dtype for parts in input_list], + shapes=[parts[0].get_shape() for parts in input_list], + shared_name='input_producer_staging_area_%d_eval_%s' % + (device_num, self._doing_eval)) + input_producer_stages.append(staging_area) + for group_index in xrange(self.batch_group_size): + batch_index = group_index + device_num * self.batch_group_size + put_op = staging_area.put( + [parts[batch_index] for parts in input_list]) + input_producer_op.append(put_op) + assert input_producer_op + + return input_processing_info._replace( + input_producer_op=input_producer_op, + input_producer_stages=input_producer_stages) + + def _maybe_initialize_fp16(self): + """Initialize fp16 settings.""" + if self.params.use_fp16 and not self._doing_eval: + init_loss_scale_val = float(self.params.fp16_loss_scale or + self.model.get_fp16_loss_scale()) + self.loss_scale = None + self.loss_scale_normal_steps = None + if self.enable_auto_loss_scale or init_loss_scale_val != 1: + self.loss_scale = tf.get_variable( + name='loss_scale', + initializer=init_loss_scale_val, + dtype=tf.float32, + trainable=False) + if self.enable_auto_loss_scale: + self.loss_scale_normal_steps = tf.get_variable( + name='loss_scale_normal_steps', initializer=0, trainable=False) + + def _build_model(self): + """Build the TensorFlow graph.""" + if self.datasets_use_prefetch: + assert not self.params.staged_vars + assert not self.variable_mgr.supports_staged_vars() + + # Adjust seed so different workers start read different input files. + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + seed_adjustment = hvd.rank() + else: + seed_adjustment = 0 + mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED, + value=self.params.tf_random_seed + seed_adjustment) + tf.set_random_seed(self.params.tf_random_seed + seed_adjustment) + mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED, + value=4321 + seed_adjustment) + np.random.seed(4321 + seed_adjustment) + phase_train = not (self._doing_eval or self.params.forward_only) + + if self._doing_eval: + mode_string = 'evaluation' + else: + mode_string = 'training' + + log_fn('Generating {} model'.format(mode_string)) + losses = [] + device_grads = [] + all_logits = [] + all_accuracy_ops = {} + gpu_compute_stage_ops = [] + gpu_grad_stage_ops = [] + + with tf.device(self.global_step_device): + global_step = tf.train.get_or_create_global_step() + self._maybe_initialize_fp16() + + # Build the processing and model for the worker. 
+ input_producer_op = None + with tf.name_scope('input_processing'): + input_processing_info = self._build_input_processing(shift_ratio=0) + if input_processing_info.input_producer_op is not None: + input_producer_op = tf.group(*input_processing_info.input_producer_op) + update_ops = None + staging_delta_ops = [] + + for device_num in range(len(self.devices)): + with tf.name_scope('tower_%i' % device_num) as name_scope, ( + self.variable_mgr.create_outer_variable_scope(device_num)): + results = self.add_forward_pass_and_gradients( + phase_train, device_num, device_num, input_processing_info, + gpu_compute_stage_ops, gpu_grad_stage_ops) + + if self.params.backbone_model_path: + self.model.add_backbone_saver() + + if phase_train: + losses.append(results['loss']) + device_grads.append(results['gradvars']) + else: + all_logits.append(results['logits']) + if not phase_train or self.params.print_training_accuracy: + for name, op in results.items(): + if name.startswith('accuracy:'): + key = name[9:] + if key not in all_accuracy_ops: + all_accuracy_ops[key] = [] + all_accuracy_ops[key].append(op) + + if device_num == 0: + # Retain the Batch Normalization updates operations only from the + # first tower. These operations update the moving mean and moving + # variance variables, which are updated (but not used) during + # training, and used during evaluation. The moving mean and variance + # approximate the true mean and variance across all images in the + # dataset. Therefore, in replicated mode, these moving averages would + # be almost identical for each tower, and so we only update and save + # the moving averages for one tower. In parameter server mode, all + # towers share a copy of the variables so we also only need to update + # and save the moving averages once. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope) + if self.datasets_use_prefetch: + assert not self.variable_mgr.staging_delta_ops + else: + staging_delta_ops = list(self.variable_mgr.staging_delta_ops) + + enqueue_ops = [] + if not self.datasets_use_prefetch: + if self.variable_mgr.supports_staged_vars(): + for staging_ops in self.variable_mgr.staging_vars_on_devices: + gpu_compute_stage_ops.extend( + [put_op for _, (put_op, _) in six.iteritems(staging_ops)]) + enqueue_ops.append(tf.group(*gpu_compute_stage_ops, + name='gpu_compute_stage_ops_group')) + if gpu_grad_stage_ops: + staging_delta_ops += gpu_grad_stage_ops + if staging_delta_ops: + enqueue_ops.append(tf.group(*(staging_delta_ops))) + + if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and + self.params.variable_update == 'replicated'): + # We need to get all the update ops instead of only those for the first + # tower. This is because during evaluation, each tower will read from its + # own tower's moving averages instead of the first tower's moving + # averages. + # TODO(reedwm): Have each tower read from the first tower's moving + # averages for a slight performance gain. 
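+      # Note that calling tf.get_collection without a scope argument (below)
+      # returns the UPDATE_OPS from every tower's name scope, whereas the call
+      # inside the tower loop above filtered them to the first tower only.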
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN, + value=self.batch_size // len(self.raw_devices)) + + fetches = self._build_fetches(global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, + phase_train) + fetches['all_logits'] = all_logits + return (input_producer_op, enqueue_ops, fetches) + + def _build_fetches(self, global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, phase_train): + """Complete construction of model graph, populating the fetches map.""" + fetches = {} + if enqueue_ops: + fetches['enqueue_ops'] = enqueue_ops + for name, ops in all_accuracy_ops.items(): + # For fetches that starts with 'tensor:', keep dimension and skip reducing + # them to scalars. + if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX): + key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):] + fetches[key] = tf.concat(ops, 0) + else: + fetches[name] = tf.reduce_sum(ops) / self.batch_size + if self.task_index == 0 and self.params.summary_verbosity >= 1: + tf.summary.scalar(name, fetches[name]) + + if not phase_train: + if self.params.forward_only: + fetches['all_logits'] = tf.concat(all_logits, 0) + return fetches + apply_gradient_devices, gradient_state = ( + self.variable_mgr.preprocess_device_grads(device_grads)) + + # TODO(reedwm): Greatly simplify the learning rate code. + if (self.params.variable_update == 'horovod' or + self.params.variable_update == 'collective_all_reduce'): + # Each worker independently increments global_step. + examples_per_step = self.batch_size * self.num_workers + else: + # global_step is shared by all workers, and so every iteration + # global_step is incremented by num_workers. + examples_per_step = self.batch_size + if self.params.compute_lr_on_cpu: + with tf.device(self.cpu_device): + learning_rate = get_learning_rate(self.params, global_step, + self.dataset.num_examples_per_epoch(), + self.model, examples_per_step) + + training_ops = [] + for d, device in enumerate(apply_gradient_devices): + with tf.device(device): + with tf.name_scope('average_loss'): + average_loss = tf.reduce_mean(losses) + with tf.name_scope('get_gradients_to_apply'): + avg_grads = self.variable_mgr.get_gradients_to_apply(d, + gradient_state) + + if not self.params.compute_lr_on_cpu: + # We compute the learning rate once for each device in + # `apply_gradient_devices`. 
+ learning_rate = get_learning_rate( + self.params, global_step, self.dataset.num_examples_per_epoch(), + self.model, examples_per_step) + gradient_clip = self.params.gradient_clip + if gradient_clip is not None: + with tf.name_scope('clip_gradients'): + clipped_grads = [(tf.clip_by_value(grad, -gradient_clip, + +gradient_clip), var) + for grad, var in avg_grads] + else: + clipped_grads = avg_grads + + learning_rate = tf.identity(learning_rate, name='learning_rate_tensor') + opt = get_optimizer(self.params, learning_rate) + loss_scale_params = variable_mgr_util.AutoLossScaleParams( + enable_auto_loss_scale=self.enable_auto_loss_scale, + loss_scale=self.loss_scale, + loss_scale_normal_steps=self.loss_scale_normal_steps, + inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n, + is_chief=not self.job_name or self.task_index == 0) + + with tf.name_scope('append_apply_gradient_ops'): + self.variable_mgr.append_apply_gradients_ops( + gradient_state, opt, clipped_grads, training_ops, + loss_scale_params) + train_op = tf.group(*(training_ops + update_ops), name='train_ops_group') + + with tf.device(self.cpu_device): + if self.task_index == 0 and self.params.summary_verbosity >= 1: + tf.summary.scalar('learning_rate', learning_rate) + tf.summary.scalar(self.params.loss_type_to_report, average_loss) + if self.loss_scale is not None: + tf.summary.scalar('loss_scale', self.loss_scale) + if self.loss_scale_normal_steps: + tf.summary.scalar('loss_scale_normal_steps', + self.loss_scale_normal_steps) + + if self.params.summary_verbosity >= 2: + self.gradient_histogram_summary(avg_grads) + + if self.params.summary_verbosity >= 3: + for grad, var in avg_grads: + if grad is not None: + tf.summary.histogram(var.op.name + '/gradients', grad) + for var in tf.trainable_variables(): + tf.summary.histogram(var.op.name, var) + + fetches['train_op'] = train_op + fetches['average_loss'] = average_loss + return fetches + + def gradient_histogram_summary(self, avg_grads): + """Create histogram of log values of all non-zero gradients.""" + with tf.name_scope('log_gradients_summary'): + all_grads = [] + for grad, _ in avg_grads: + all_grads.append(tf.reshape(grad, [-1])) + grads = tf.abs(tf.concat(all_grads, 0)) + # exclude grads with zero values. + indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0)) + log_grads = tf.reshape( + tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1]) + tf.summary.histogram('log_gradients', log_grads) + + def _build_model_single_session(self): + """Build the TensorFlow graph for multiple replicas in a single_session. + + Returns: + input_producer_op: + enqueue_ops: + fetches: + + Raises: + ValueError: optimizer not recognized. + + Single session runs multiple model replicas as part of one large + distributed graph, whose global execution is always step-synchronized. 
+ """ + # verify assumptions + assert self.params.task_index == 0 + assert not self._doing_eval + assert not self.params.forward_only + assert not self.params.staged_vars + + tf.set_random_seed(self.params.tf_random_seed) + np.random.seed(4321) + phase_train = True + + log_fn('Generating training model') + losses = [] + device_grads = [] + all_logits = [] + all_accuracy_ops = {} + gpu_compute_stage_ops = [] + gpu_grad_stage_ops = [] + + with tf.device(self.global_step_device): + global_step = tf.train.get_or_create_global_step() + + update_ops = [] + global_input_producer_op = [] + + is_local = not self.job_name + if is_local: + assert self.num_workers == 1 + for task_num in range(self.num_workers): + # Reset the devices that self.variable_mgr knows about to those + # belonging to the next worker (task). + self.reset_devices_for_task(task_num, is_local) + # Build the per-worker image processing + with tf.name_scope('input_processing'): + input_processing_info = self._build_input_processing( + shift_ratio=(task_num / self.num_workers)) + if input_processing_info.input_producer_op is not None: + global_input_producer_op.extend(input_processing_info.input_producer_op) + # Build the per-worker model replica. + for rel_device_num in range(len(self.devices)): + abs_device_num = task_num * len(self.devices) + rel_device_num + with self.variable_mgr.create_outer_variable_scope( + abs_device_num), tf.name_scope( + 'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope: + task_results = self.add_forward_pass_and_gradients( + phase_train, rel_device_num, abs_device_num, + input_processing_info, gpu_compute_stage_ops, gpu_grad_stage_ops) + + if self.params.backbone_model_path: + self.model.add_backbone_saver() + + if phase_train: + losses.append(task_results['loss']) + device_grads.append(task_results['gradvars']) + else: + all_logits.append(task_results['logits']) + if not phase_train or self.params.print_training_accuracy: + for name, op in task_results.items(): + if name.startswith('accuracy:'): + key = name[9:] + if key not in all_accuracy_ops: + all_accuracy_ops[key] = [] + all_accuracy_ops[key].append(op) + + if rel_device_num == 0: + # Retain the Batch Normalization updates operations only + # from the first tower. These operations update the moving + # mean and moving variance variables, which are updated + # (but not used) during training, and used during + # evaluation. The moving mean and variance approximate the + # true mean and variance across all images in the + # dataset. Therefore, in replicated mode, these moving + # averages would be almost identical for each tower, and + # so we only update and save the moving averages for one + # tower. In parameter server mode, all towers share a copy + # of the variables so we also only need to update and save + # the moving averages once. 
+ update_ops.extend( + tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)) + assert not self.variable_mgr.staging_delta_ops + + enqueue_ops = [] + if gpu_compute_stage_ops: + enqueue_ops.append(tf.group(*gpu_compute_stage_ops, + name='gpu_compute_stage_ops')) + assert not self.variable_mgr.supports_staged_vars() + assert not gpu_grad_stage_ops + + fetches = self._build_fetches(global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, + phase_train) + if global_input_producer_op: + global_input_producer_op = tf.group(*global_input_producer_op) + else: + global_input_producer_op = None + return (global_input_producer_op, enqueue_ops, fetches) + + def add_forward_pass_and_gradients(self, + phase_train, + rel_device_num, + abs_device_num, + input_processing_info, + gpu_compute_stage_ops, + gpu_grad_stage_ops): + """Add ops for forward-pass and gradient computations.""" + nclass = self.dataset.num_classes + if self.datasets_use_prefetch: + assert input_processing_info.multi_device_iterator_input, ( + 'multi_device_iterator_input cannot be None if ' + 'datasets_use_prefetch=True') + input_list = ( + input_processing_info.multi_device_iterator_input[rel_device_num]) + else: + if not self.dataset.use_synthetic_gpu_inputs(): + input_producer_stage = input_processing_info.input_producer_stages[ + rel_device_num] + with tf.device(self.cpu_device): + host_input_list = input_producer_stage.get() + with tf.device(self.raw_devices[rel_device_num]): + gpu_compute_stage = data_flow_ops.StagingArea( + [inp.dtype for inp in host_input_list], + shapes=[inp.get_shape() for inp in host_input_list]) + # The CPU-to-GPU copy is triggered here. + gpu_compute_stage_op = gpu_compute_stage.put(host_input_list) + input_list = gpu_compute_stage.get() + gpu_compute_stage_ops.append(gpu_compute_stage_op) + else: + with tf.device(self.raw_devices[rel_device_num]): + # Minor hack to avoid H2D copy when using synthetic data + input_list = self.model.get_synthetic_inputs( + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass) + + # Labels reshaping happens all on gpu:0. Reshaping synthetic labels on + # multiple devices slows down XLA computation for an unknown reason. + # TODO(b/116875203): Find/address root cause of XLA slow down. + labels_device_placement_hack = ( + self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile) + + def device_aware_reshape(tensor, shape): + device = self.devices[rel_device_num] + # Labels are int32, place reshapes on gpu:0 (no device placement) when the + # hack is enabled. + if labels_device_placement_hack and tensor.dtype == tf.int32: + device = '' + with tf.device(device): + return tf.reshape(tensor, shape=shape) + + subset = 'validation' if self._doing_eval else 'train' + input_shapes = self.model.get_input_shapes(subset) + input_list = [ + device_aware_reshape(input_list[i], shape=input_shapes[i]) + for i in range(len(input_list)) + ] + + def forward_pass_and_gradients(): + """Builds forward pass and gradient computation network. + + When phase_train=True and print_training_accuracy=False: + return [loss] + grads + + When phase_train=True and print_training_accuracy=True: + return [logits, loss] + grads + + When phase_train=False, + return [logits] + + Its output can always be unpacked by + + ``` + outputs = forward_pass_and_gradients() + logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs) + ``` + + Returns: + outputs: A list of tensors depending on different modes. 
+ """ + + build_network_result = self.model.build_network( + input_list, phase_train, nclass) + logits = build_network_result.logits + + if not phase_train: + return [logits] + + base_loss = self.model.loss_function(input_list, build_network_result) + params = self.variable_mgr.trainable_variables_on_device( + rel_device_num, abs_device_num) + l2_loss = None + total_loss = base_loss + with tf.name_scope('l2_loss'): + fp32_params = params + if self.model.data_type == tf.float16 and self.params.fp16_vars: + # fp16 reductions are very slow on GPUs, so cast to fp32 before + # calling tf.nn.l2_loss and tf.add_n. + # TODO(b/36217816): Once the bug is fixed, investigate if we should do + # this reduction in fp16. + fp32_params = (tf.cast(p, tf.float32) for p in params) + filtered_params = self.model.filter_l2_loss_vars(fp32_params) + if rel_device_num == len(self.devices) - 1: + # We compute the L2 loss for only one device instead of all of them, + # because the L2 loss for each device is the same. To adjust for this, + # we multiply the L2 loss by the number of devices. We choose the + # last device because for some reason, on a Volta DGX1, the first four + # GPUs take slightly longer to complete a step than the last four. + # TODO(reedwm): Shard the L2 loss computations across GPUs. + if self.params.single_l2_loss_op: + # TODO(reedwm): If faster, create a fused op that does the L2 loss + # on multiple tensors, and use that instead of concatenating + # tensors. + reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params] + l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0)) + else: + l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params]) + weight_decay = self.params.weight_decay + mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay) + if (weight_decay is not None and weight_decay != 0. and + l2_loss is not None): + mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION, + value=weight_decay) + total_loss += len(self.devices) * weight_decay * l2_loss + + aggmeth = tf.AggregationMethod.DEFAULT + scaled_loss = (total_loss if self.loss_scale is None + else total_loss * self.loss_scale) + grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth) + if self.params.sparse_to_dense_grads: + # Passing a sparse gradient to convert_to_tensor turns it into a dense + # gradient. A sparse gradient is an instance of tf.IndexedSlices. + # convert_to_tensor does not modify dense tensors. + grads = [tf.convert_to_tensor(g) for g in grads] + if self.loss_scale is not None: + # TODO(reedwm): If automatic loss scaling is not used, we could avoid + # these multiplications by directly modifying the learning rate instead. + # If this is done, care must be taken to ensure that this scaling method + # is correct, as some optimizers square gradients and do other + # operations which might not be compatible with modifying both the + # gradients and the learning rate. + + grads = [ + grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads + ] + + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + if self.params.horovod_device: + horovod_device = '/%s:0' % self.params.horovod_device + else: + horovod_device = '' + # All-reduce gradients using Horovod. 
+ grads = [hvd.allreduce(grad, average=False, device_dense=horovod_device) + for grad in grads] + + if self.params.staged_vars: + grad_dtypes = [grad.dtype for grad in grads] + grad_shapes = [grad.shape for grad in grads] + grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes) + grad_stage_op = grad_stage.put(grads) + # In general, this decouples the computation of the gradients and + # the updates of the weights. + # During the pipeline warm up, this runs enough training to produce + # the first set of gradients. + gpu_grad_stage_ops.append(grad_stage_op) + grads = grad_stage.get() + + if self.params.loss_type_to_report == 'total_loss': + loss = total_loss + else: + loss = base_loss + + if self.params.print_training_accuracy: + return [logits, loss] + grads + else: + return [loss] + grads + + def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs): + """Unpacks outputs from forward_pass_and_gradients. + + Args: + forward_pass_and_grad_outputs: Output from forward_pass_and_gradients. + + Returns: + logits: Unscaled probability distribution from forward pass. + If unavailable, None is returned. + loss: Loss function result from logits. + If unavailable, None is returned. + grads: Gradients for all trainable variables. + If unavailable, None is returned. + """ + logits = None + # logits is only fetched in non-train mode or when + # print_training_accuracy is set. + if not phase_train or self.params.print_training_accuracy: + logits = forward_pass_and_grad_outputs.pop(0) + + loss = ( + forward_pass_and_grad_outputs[0] + if forward_pass_and_grad_outputs else None) + grads = ( + forward_pass_and_grad_outputs[1:] + if forward_pass_and_grad_outputs else None) + + return logits, loss, grads + + def make_results(logits, loss, grads): + """Generate results based on logits, loss and grads.""" + results = {} # The return value + + if logits is not None: + results['logits'] = logits + accuracy_ops = self.model.accuracy_function(input_list, logits) + for name, op in accuracy_ops.items(): + results['accuracy:' + name] = op + + if loss is not None: + results['loss'] = loss + + if grads is not None: + param_refs = self.variable_mgr.trainable_variables_on_device( + rel_device_num, abs_device_num, writable=True) + results['gradvars'] = list(zip(grads, param_refs)) + + return results + + with tf.device(self.devices[rel_device_num]): + outputs = maybe_compile(forward_pass_and_gradients, self.params) + logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs) + return make_results(logits, loss, grads) + + def get_input_preprocessor(self): + """Returns the image preprocessor to used, based on the model. + + Returns: + The image preprocessor, or None if synthetic data should be used. + """ + shift_ratio = 0 + if self.job_name: + # shift_ratio prevents multiple workers from processing the same batch + # during a step + shift_ratio = self.task_index / self.num_workers + + processor_class = self.dataset.get_input_preprocessor( + self.params.input_preprocessor) + assert processor_class + subset = 'validation' if self._doing_eval else 'train' + return processor_class( + self.batch_size * self.batch_group_size, + self.model.get_input_shapes(subset), + len(self.devices) * self.batch_group_size, + dtype=self.model.data_type, + train=(not self._doing_eval), + # TODO(laigd): refactor away image model specific parameters. 
+ distortions=self.params.distortions, + resize_method=self.resize_method, + shift_ratio=shift_ratio, + summary_verbosity=self.params.summary_verbosity, + distort_color_in_yiq=self.params.distort_color_in_yiq, + fuse_decode_and_crop=self.params.fuse_decode_and_crop, + match_mlperf=self.params.ml_perf) + + def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list): + """Adds ops to enqueue on all worker queues. + + Args: + name_prefix: prefixed for the shared_name of ops. + enqueue_after_list: control dependency from ops. + + Returns: + An op that should be used as control dependency before starting next step. + """ + self.sync_queue_counter += 1 + with tf.device(self.sync_queue_devices[( + self.sync_queue_counter % len(self.sync_queue_devices))]): + sync_queues = [ + tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]], + shared_name='%s%s' % (name_prefix, i)) + for i in range(self.num_workers)] + queue_ops = [] + # For each other worker, add an entry in a queue, signaling that it can + # finish this step. + token = tf.constant(False) + with tf.control_dependencies(enqueue_after_list): + for i, q in enumerate(sync_queues): + if i == self.task_index: + queue_ops.append(tf.no_op()) + else: + queue_ops.append(q.enqueue(token)) + + # Drain tokens off queue for this worker, one for each other worker. + queue_ops.append( + sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1)) + + return tf.group(*queue_ops) + + +def _is_mkl_flag_absent(mkl_flag): + return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS + and absl_flags.FLAGS[mkl_flag].present) + + +def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var): + tf.logging.warn( + ('OS ENV variable %s=%s is ignored and script default: ' + '%s is used. Use --%s to override.') % + (os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag)) + + +def set_default_param_values_and_env_vars(params): + """Sets up the default param values and environment variables .""" + if params.batchnorm_persistent: + os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' + else: + os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None) + if params.winograd_nonfused: + os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + else: + os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None) + if params.autotune_threshold: + os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold) + os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish)) + argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # Sets environment variables for MKL + # If OS ENV vars are overridden by script defaults, a warning msg is printed. 
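+  # Illustrative example (flag values are hypothetical): running with
+  # --mkl=True --kmp_blocktime=0 --num_intra_threads=14 exports
+  # KMP_BLOCKTIME=0 and OMP_NUM_THREADS=14; kmp_settings and kmp_affinity are
+  # exported as KMP_SETTINGS and KMP_AFFINITY in the same way.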
+ if params.mkl: + mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity', + 'num_intra_threads'] + for mkl_flag in mkl_flags: + os_env_var = mkl_flag.upper() + if mkl_flag == 'num_intra_threads': + os_env_var = 'OMP_NUM_THREADS' + flag_val = str(getattr(params, mkl_flag)) + if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ: + _print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var) + os.environ[os_env_var] = flag_val + if mkl_flag == 'num_intra_threads' and not params.num_intra_threads: + os.environ.pop(os_env_var, None) + + # Sets GPU thread settings + if params.device.lower() == 'gpu': + params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower()) + if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']: + raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode) + os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode + + if params.per_gpu_thread_count and params.gpu_thread_mode == 'global': + raise ValueError( + 'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' % + params.per_gpu_thread_count) + # Default to two threads. One for the device compute and the other for + # memory copies. + per_gpu_thread_count = params.per_gpu_thread_count or 2 + total_gpu_thread_count = per_gpu_thread_count * params.num_gpus + + if params.gpu_thread_mode == 'gpu_private': + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + elif params.gpu_thread_mode == 'gpu_shared': + os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count) + + cpu_count = multiprocessing.cpu_count() + if not params.num_inter_threads and params.gpu_thread_mode in [ + 'gpu_private', 'gpu_shared' + ]: + main_thread_count = max(cpu_count - total_gpu_thread_count, 1) + params = params._replace(num_inter_threads=main_thread_count) + + if (params.datasets_use_prefetch and + params.datasets_num_private_threads is None): + # From the total cpu thread count, subtract the total_gpu_thread_count, + # and then 2 threads per GPU device for event monitoring and sending / + # receiving tensors + num_monitoring_threads = 2 * params.num_gpus + num_private_threads = max( + cpu_count - total_gpu_thread_count - num_monitoring_threads, 1) + params = params._replace(datasets_num_private_threads=num_private_threads) + return params + + +def setup(params): + """Sets up the environment that BenchmarkCNN should run in. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + + Returns: + A potentially modified params. + Raises: + ValueError: invalid parames combinations. + """ + # Set up environment variables before doing any other global initialization to + # make sure it uses the appropriate environment variables. + params = set_default_param_values_and_env_vars(params) + + # horovod needs to be initialized before create_config_proto() call since + # it will be used in config generation if enabled. + if params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + hvd.init() + + platforms_util.initialize(params, create_config_proto(params)) + + if not params.job_name: + # Create a dummy session to initialize TF global variables using the input + # params. Otherwise, ListDevices function may create global devices using + # the default config instead of using the user provided config. + # + # TODO(hinsu): Find a way to achieve the same for distributed benchmark. It + # is not legal to create distributed session after local session. 
It is also + # not possible to create distributed session here as that results in + # multiple creation of ClusterManager and Server. + with tf.Session(config=create_config_proto(params)) as sess: + del sess + + return params + + +def maybe_compile(computation, params): + if params and params.xla_compile: + return tf.xla.experimental.compile(computation) + else: + return computation() diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test.py b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test.py new file mode 100644 index 0000000000000000000000000000000000000000..43dac487f90e1014f9429b12a89fa93ac5ef19be --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test.py @@ -0,0 +1,493 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests running benchmark_cnn in distributed mode. + +This is done by spawning one process per task. Each process runs +benchmark_cnn_distributed_test_runner.py. + +The output for each process is written to disk and can be viewed to debug tests. +See get_test_output_dir() in platforms/default/util.py for more info. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from collections import namedtuple +import os +import subprocess +import time +import unittest + +from absl import flags as absl_flags +import portpicker +import six +import tensorflow.compat.v1 as tf +import flags +import test_util +from platforms import util as platforms_util + +FLAGS = absl_flags.FLAGS + + +def _convert_params_to_flags_list(params): + """Converts Params to a list of flags. Skips default-valued parameters. + + E.g., converts + benchmark_cnn.make_params(batch_size=32, model='resnet50') + to + ['--batch_size=32', '--model=resnet50'] + + Args: + params: Params for BenchmarkCNN. + Returns: + A list of flags. + """ + return [ + '--%s=%s' % (k, str(v)) for k, v in six.iteritems(params._asdict()) + if v != flags.param_specs[k].default_value + ] + + +# When outputting a process's output in the log, maximum number of characters +# to output. The log system does not allow us to output more than this in a +# single log message, but this limit is also useful to avoid the logs from +# becoming too large (the full process output is written to disk). +MAX_OUTPUT_CHARS = 15000 + + +# A process. name is a string identifying the process in logs. stdout and +# stderr are file objects of the process's stdout and stderr, respectively. +_ProcessInfo = namedtuple('_ProcessInfo', ['name', 'popen', 'stdout', 'stderr']) + + +def _create_task_process(job_name, task_index, args, env, output_dir): + """Creates a process for a single task for benchmark_cnn. + + Args: + job_name: 'worker' or 'ps' or ''. Empty string used for non-distributed + mode. + task_index: The index of the task within the cluster. 
+ args: A list of arguments to pass to the task. This function additionally + sets --task_index and --job_name + env: The environment to use for the task. + output_dir: Where to place the output files, storing the task's stdout and + stderr. + Returns: + A _ProcessInfo namedtuple of the running process. The stdout and stderr + fields of this tuple must be closed by the caller once the process ends. + """ + args = args[:] + args += ['--task_index=%s' % task_index, '--job_name=%s' % job_name] + name_prefix = job_name or 'local' + process_name = '%s_%s' % (name_prefix, task_index) + tf.logging.info('Spawning %s process: %s' % (process_name, ' '.join(args))) + stdout_filename = os.path.join(output_dir, '%s_stdout.txt' % process_name) + stderr_filename = os.path.join(output_dir, '%s_stderr.txt' % process_name) + stdout_file = open(stdout_filename, 'w+') + stderr_file = open(stderr_filename, 'w+') + popen = subprocess.Popen( + args, stdout=stdout_file, stderr=stderr_file, env=env) + return _ProcessInfo(process_name, popen, stdout_file, stderr_file) + + +def _wait_for_processes(wait_processes, kill_processes): + """Waits until all `wait_processes` finish, then kills `kill_processes`. + + Fails an assert if a process in `wait_processes` finishes unsuccessfully. + The processes in `kill_processes` are assumed to never finish so they are + killed. + + Args: + wait_processes: A list of _ProcessInfo tuples. This function will wait + for each to finish. + kill_processes: A list of _ProcessInfo tuples. Each will be killed once + every process in `wait_processes` is finished. + Returns: + A list of strings, each which is a string of the stdout of a wait process. + """ + wait_process_stdouts = [None] * len(wait_processes) + finished_wait_processes = set() + while len(finished_wait_processes) < len(wait_processes): + for i, wait_process in enumerate(wait_processes): + if i in finished_wait_processes: + continue + ret_code = wait_process.popen.poll() + if ret_code is None: + continue + tf.logging.info('{} finished'.format(wait_process.name)) + wait_process.stdout.seek(0) + wait_process_stdouts[i] = wait_process.stdout.read() + tf.logging.info('stdout for {} (last {} chars): {}\n'.format( + wait_process.name, MAX_OUTPUT_CHARS, + wait_process_stdouts[i][-MAX_OUTPUT_CHARS:])) + wait_process.stderr.seek(0) + tf.logging.info('stderr for {} (last {} chars): {}\n'.format( + wait_process.name, MAX_OUTPUT_CHARS, + wait_process.stderr.read()[-MAX_OUTPUT_CHARS:])) + assert ret_code == 0, 'Process failed with return code %d' % ret_code + finished_wait_processes.add(i) + for kill_process in kill_processes: + ret_code = kill_process.popen.poll() + # kill processes should not end until we kill them. + assert ret_code is None, 'Process returned early with code %d' % ret_code + time.sleep(0.25) + tf.logging.info('All wait processes finished') + for i, kill_process in enumerate(kill_processes): + # Kill each kill process. + kill_process.popen.kill() + kill_process.popen.wait() + kill_process.stdout.seek(0) + tf.logging.info('stdout for {} (last {} chars): {}\n'.format( + kill_process.name, MAX_OUTPUT_CHARS, + kill_process.stdout.read()[-MAX_OUTPUT_CHARS:])) + kill_process.stderr.seek(0) + tf.logging.info('stderr for {} (last {} chars): {}\n'.format( + kill_process.name, MAX_OUTPUT_CHARS, + kill_process.stderr.read()[-MAX_OUTPUT_CHARS:])) + return wait_process_stdouts + + +def _spawn_benchmark_processes(output_dir_path, num_workers, num_ps, + num_controllers, params): + """Run training or evaluation in spawned processes. 
+ + Runs locally if num_workers == 1, num_ps == 0, and num_controllers == 0, + otherwise runs in distributed mode. In either case, one process is spawned + per worker and ps. Waits for training/evaluation to finish before returning. + + Args: + output_dir_path: Relative path where stdout and stderr files will be + placed. + num_workers: Number of workers to spawn. + num_ps: Number of ps processes to spawn. + num_controllers: Number of controller processes to spawn (must be 0 or 1). + params: Params for BenchmarkCNN in each subprocess. + Returns: + A list output_list of outputs from all processes that output the + images/sec and accuracy. This process is the controller host in + distributed_all_reduce, and the workers otherwise. output_list[i] is a + list of lines from the ith worker's stdout. + """ + run_distributed = num_workers != 1 or num_ps != 0 or num_controllers != 0 + if params.variable_update == 'distributed_all_reduce': + assert num_controllers == 1 or not run_distributed + assert num_ps == 0 + else: + assert num_controllers == 0 + output_base_dir = platforms_util.get_test_output_dir() + output_dir = os.path.join(output_base_dir, output_dir_path) + os.makedirs(output_dir) + tf.logging.info('Outputs of processes will be outputted to: %s' % output_dir) + + args = platforms_util.get_command_to_run_python_module( + 'benchmark_cnn_distributed_test_runner') + args += _convert_params_to_flags_list(params) + if run_distributed: + worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] + ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] + controller_ports = [portpicker.pick_unused_port() + for _ in range(num_controllers)] + # The numerator is 0.7 instead of 1 to leave some memory for the Cuda + # runtime, etc. + gpu_memory_frac = 0.7 / num_workers + args += [ + '--gpu_memory_frac_for_testing=%f' % gpu_memory_frac, + '--worker_hosts=' + ','.join('localhost:%d' % p for p in worker_ports) + ] + if num_ps > 0: + ps_hosts_str = ','.join('localhost:%d' % p for p in ps_ports) + args.append('--ps_hosts=' + ps_hosts_str) + else: + controller_host_str = ','.join('localhost:%d' % p + for p in controller_ports) + args.append('--controller_host=' + controller_host_str) + env = os.environ.copy() + # Allow stdout to be viewed before the process ends. + env['PYTHONUNBUFFERED'] = '1' + + worker_processes = [] + ps_processes = [] + controller_processes = [] + try: + for i in range(num_workers): + job_name = 'worker' if run_distributed else '' + process = _create_task_process(job_name, i, args, env, output_dir) + worker_processes.append(process) + # Don't let ps or controller processes use the gpu. + env['CUDA_VISIBLE_DEVICES'] = '' + + for i in range(num_ps): + process = _create_task_process('ps', i, args, env, output_dir) + ps_processes.append(process) + for i in range(num_controllers): + process = _create_task_process('controller', i, args, env, output_dir) + controller_processes.append(process) + # If all distributed all reduce mode is being used, the controller process + # finishes and the worker processes block forever. Otherwise, the worker + # processes finish and the ps processes block forever. We set + # wait_processes and kill_processes accordingly. 
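+    # Illustrative example: with variable_update=distributed_all_reduce and
+    # two workers, the single controller process is waited on and the two
+    # worker processes are killed once it exits; in the parameter-server
+    # setups the workers are waited on and the ps processes are killed.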
+ if controller_processes: + wait_processes = controller_processes + kill_processes = worker_processes + else: + wait_processes = worker_processes + kill_processes = ps_processes + outputs = _wait_for_processes(wait_processes, kill_processes) + finally: + for process in worker_processes + ps_processes + controller_processes: + try: + process.popen.kill() + except OSError: + pass # It's OK (and expected) if the process already exited. + process.stdout.close() + process.stderr.close() + return [output.splitlines() for output in outputs] + + +# When this test class is run, a method will fail about 0.3% of the time with a +# gRPC error. It is not clear why this occurs. +# TODO(reedwm): Fix this test class. +class TfCnnBenchmarksDistributedTest(tf.test.TestCase): + """Tests running benchmark_cnn in distributed mode.""" + + # We cannot check for a GPU via tf.test.is_gpu_available() before the tests in + # this class because it allocates all the GPU memory which would cause the + # spawned processes to run out of GPU memory. + + def _test_distributed(self, + test_name, + num_workers, + num_ps, + params, + num_controllers=0, + check_output_values=False, + skip=None): + # TODO(reedwm): check_output_values should default to True and be enabled + # on every test. See the TODO in benchmark_cnn_test.py. + def run_fn(run_type, inner_params): + output_dir_path = os.path.join(test_name, run_type) + if run_type == 'Evaluation': + # Distributed evaluation is not supported, so we use a single process. + # We still must spawn another process, because if we evaluate in the + # current process, it would allocate the GPU memory causing future test + # methods to fail. + if inner_params.variable_update == 'distributed_replicated': + inner_params = inner_params._replace(variable_update='replicated') + return _spawn_benchmark_processes( + output_dir_path, num_workers=1, num_ps=0, num_controllers=0, + params=inner_params) + else: + return _spawn_benchmark_processes(output_dir_path, num_workers, num_ps, + num_controllers, inner_params) + + return test_util.train_and_eval(self, run_fn, params, + check_output_values=check_output_values, + skip=skip) + + def testParameterServer(self): + test_name = 'testParameterServer' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 2, 2, params) + + def testParameterServerStaged(self): + test_name = 'testParameterServerStaged' + params = test_util.get_params(test_name)._replace(staged_vars=True) + self._test_distributed(test_name, 2, 2, params) + + def testReplicated(self): + test_name = 'testReplicated' + params = test_util.get_params(test_name)._replace( + variable_update='distributed_replicated') + self._test_distributed(test_name, 2, 2, params) + + def testAllReducePsgpu(self): + test_name = 'testAllReducePsgpu' + flags_dict = test_util.get_params(test_name)._replace( + variable_update='distributed_all_reduce', + all_reduce_spec='psgpu#4') + self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1) + + def testAllReducePscpuXring(self): + test_name = 'testAllReducePscpuXring' + flags_dict = test_util.get_params(test_name)._replace( + variable_update='distributed_all_reduce', + all_reduce_spec='pscpu:2k:xring') + self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1) + + def testForwardOnly(self): + test_name = 'testForwardOnly' + params = test_util.get_params(test_name)._replace(forward_only=True) + # Evaluation is not supported with --forward_only, so we set skip='eval'. 
+ self._test_distributed(test_name, 2, 2, params, skip='eval') + + def testSingleWorkerAndPs(self): + test_name = 'testSingleWorkerAndPs' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 1, 1, params) + + def testThreeWorkersAndPses(self): + test_name = 'testThreeWorkersAndPses' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 3, 3, params) + + def testOneWorkerThreePses(self): + test_name = 'testOneWorkerThreePses' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 1, 3, params) + + def testThreeWorkersOnePs(self): + test_name = 'testThreeWorkersOnePs' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 3, 1, params) + + def testNoPrintTrainingAccuracy(self): + test_name = 'testNoPrintTrainingAccuracy' + params = test_util.get_params(test_name)._replace( + print_training_accuracy=False) + self._test_distributed(test_name, 2, 2, params) + + def testRmspropParameterServer(self): + test_name = 'testRmspropParameterServer' + params = test_util.get_params(test_name)._replace(optimizer='rmsprop') + self._test_distributed(test_name, 2, 2, params) + + def testMomentumReplicated(self): + test_name = 'testMomentumReplicated' + params = test_util.get_params(test_name)._replace( + optimizer='momentum', variable_update='distributed_replicated') + self._test_distributed(test_name, 2, 2, params) + + def testNoCrossReplicaSyncParameterServerStaged(self): + test_name = 'testNoCrossReplicaSyncParameterServerStaged' + params = test_util.get_params(test_name)._replace( + staged_vars=True, cross_replica_sync=False) + self._test_distributed(test_name, 2, 2, params) + + def testSingleGpu(self): + test_name = 'testSingleGpu' + params = test_util.get_params(test_name)._replace(num_gpus=1) + self._test_distributed(test_name, 2, 2, params) + + def testBatchGroupSize(self): + test_name = 'testBatchGroupSize' + params = test_util.get_params(test_name)._replace( + batch_group_size=4, num_batches=100, num_warmup_batches=5) + self._test_distributed(test_name, 2, 2, params) + + def testFp16WithFp32Vars(self): + test_name = 'testFp16WithFp32Vars' + params = test_util.get_params(test_name)._replace( + use_fp16=True, fp16_vars=False) + self._test_distributed(test_name, 2, 2, params) + + def testFp16WithFp16Vars(self): + test_name = 'testFp16WithFp16Vars' + params = test_util.get_params(test_name)._replace( + use_fp16=True, fp16_vars=True, fp16_loss_scale=1.) 
+ self._test_distributed(test_name, 2, 2, params) + + def testFp16Replicated(self): + test_name = 'testFp16Replicated' + params = test_util.get_params(test_name)._replace( + use_fp16=True, variable_update='distributed_replicated') + self._test_distributed(test_name, 2, 2, params) + + @unittest.skip('b/147310862: Fails for unknown reason') + def testReplicatedRealData(self): + test_name = 'testReplicatedRealData' + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params(test_name)._replace( + variable_update='distributed_replicated', + data_dir=imagenet_dir, + data_name='imagenet') + self._test_distributed(test_name, 2, 2, params) + + +class DistributedVariableUpdateTest(tf.test.TestCase): + """Tests that variables are updated correctly in distributed mode.""" + + def _test_variable_update(self, + test_name, + num_workers, + num_ps, + params, + num_controllers=0): + """Tests variables are updated correctly when the given params are used.""" + output_dir_path = os.path.join(test_name, 'variable_update') + logs = _spawn_benchmark_processes(output_dir_path, num_workers, num_ps, + num_controllers, params) + actual_losses = [] + for worker_logs in logs: + outputs = test_util.get_training_outputs_from_logs( + worker_logs, params.print_training_accuracy) + actual_losses.append([x.loss for x in outputs]) + + inputs = test_util.get_fake_var_update_inputs() + expected_losses = test_util.TestCNNModel().manually_compute_losses( + inputs, num_workers, params) + if params.variable_update == 'distributed_all_reduce': + # In distributed all reduce, each step, the controller outputs the average + # of the loss from each worker. So we modify expected losses accordingly. + # E.g, we change [[1, 2], [4, 5]] to [[2.5, 3.5]] + expected_losses = [[sum(losses) / num_workers + for losses in zip(*expected_losses)]] + rtol = 3e-2 if params.use_fp16 else 1e-5 + for worker_actual_losses, worker_expected_losses in zip(actual_losses, + expected_losses): + self.assertAllClose(worker_actual_losses[:len(worker_expected_losses)], + worker_expected_losses, rtol=rtol, atol=0.) + + def _test_variable_updates(self, test_name, params): + """Tests variables are updated correctly with various variable updates.""" + + # Unfortunately, distributed parameter server is non-deterministic with + # multiple workers, because one worker may write to a variable before + # another worker reads it. This probably does not harm training, but it + # does mean we cannot easily test that case. So, we use one worker. 
+ self._test_variable_update( + test_name + '_ps', num_workers=1, num_ps=2, num_controllers=0, + params=params._replace(variable_update='parameter_server')) + + self._test_variable_update( + test_name + '_rep', num_workers=2, num_ps=1, num_controllers=0, + params=params._replace(variable_update='distributed_replicated')) + + self._test_variable_update( + test_name + '_allreduce', num_workers=2, num_ps=0, num_controllers=1, + params=params._replace(variable_update='distributed_all_reduce', + all_reduce_spec='psgpu#%d' % params.num_gpus)) + + def testVarUpdateDefault(self): + params = test_util.get_var_update_params() + self._test_variable_updates('testVarUpdateDefault', params) + + def testVarUpdateCpuAsLocalParamDevice(self): + params = test_util.get_var_update_params()._replace( + local_parameter_device='cpu') + self._test_variable_updates('testVarUpdateCpuAsLocalParamDevice', params) + + def testVarUpdateFp16(self): + params = test_util.get_var_update_params()._replace(use_fp16=True) + self._test_variable_updates('testVarUpdateFp16', params) + + def testVarUpdateResourceVars(self): + params = test_util.get_var_update_params()._replace(use_resource_vars=True) + self._test_variable_updates('testVarUpdateResourceVars', params) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test_runner.py b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..9291a801e4606c2b1982e5e1e0df833227a45e8f --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test_runner.py @@ -0,0 +1,122 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Used to run benchmark_cnn for distributed tests. + +In distributed tests, we spawn processes to run tf_cnn_benchmark tasks. We could +directly spawn tf_cnn_benchmark processes, but we want some added functionality, +such as being able to inject custom images during training. So instead, this +file is spawned as a Python process, which supports the added functionality. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags as absl_flags +import numpy as np +import tensorflow.compat.v1 as tf +import benchmark_cnn +import flags +import preprocessing +import test_util + + +absl_flags.DEFINE_string('fake_input', 'none', + """What fake input to inject into benchmark_cnn. This + is ignored if --model=test_model. + Options are: + none: Do not inject any fake input. + zeros_and_ones: Half the images will be all 0s with + a label of 0. 
Half the images will be all 1s with a + label of 1.""") + +flags.define_flags() +FLAGS = flags.FLAGS + + +def get_test_image_preprocessor(batch_size, params): + """Returns the preprocessing.TestImagePreprocessor that should be injected. + + Returns None if no preprocessor should be injected. + + Args: + batch_size: The batch size across all GPUs. + params: BenchmarkCNN's parameters. + Returns: + Returns the preprocessing.TestImagePreprocessor that should be injected. + Raises: + ValueError: Flag --fake_input is an invalid value. + """ + if FLAGS.fake_input == 'none': + return None + elif FLAGS.fake_input == 'zeros_and_ones': + half_batch_size = batch_size // 2 + images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32) + images[half_batch_size:, :, :, :] = 1 + labels = np.array([0] * half_batch_size + [1] * half_batch_size, + dtype=np.int32) + preprocessor = preprocessing.TestImagePreprocessor( + batch_size, [227, 227, 3], params.num_gpus, + benchmark_cnn.get_data_type(params)) + preprocessor.set_fake_data(images, labels) + preprocessor.expected_subset = 'validation' if params.eval else 'train' + return preprocessor + else: + raise ValueError('Invalid --fake_input: %s' % FLAGS.fake_input) + + +def run_with_real_model(params): + """Runs tf_cnn_benchmarks with a real model.""" + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + preprocessor = get_test_image_preprocessor(bench.batch_size, params) + if preprocessor is not None: + # The test image preprocessor requires queue runners. Since this file is + # used for testing, it is OK to access protected members. + # pylint: disable=protected-access + bench.dataset._queue_runner_required = True + # pylint: enable=protected-access + bench.input_preprocessor = preprocessor + bench.run() + + +def run_with_test_model(params): + """Runs tf_cnn_benchmarks with a test model.""" + model = test_util.TestCNNModel() + inputs = test_util.get_fake_var_update_inputs() + with test_util.monkey_patch(benchmark_cnn, + LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15): + bench = benchmark_cnn.BenchmarkCNN(params, dataset=test_util.TestDataSet(), + model=model) + # The test model does not use labels when computing loss, so the label + # values do not matter as long as it's the right shape. + labels = np.array([1] * inputs.shape[0]) + bench.input_preprocessor.set_fake_data(inputs, labels) + bench.run() + + +def main(_): + params = benchmark_cnn.make_params_from_flags() + params = benchmark_cnn.setup(params) + if params.model == 'test_model': + run_with_test_model(params) + else: + run_with_real_model(params) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.app.run() diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn_test.py b/cv/classification/resnet50/tensorflow/benchmark_cnn_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9e849739c4687e2f53803fdb8d40d9a7e97ccb80 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn_test.py @@ -0,0 +1,1493 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for benchmark_cnn.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import glob +import os +import re +import unittest + +import mock +import numpy as np +import tensorflow.compat.v1 as tf +from google.protobuf import text_format +from tensorflow.core.framework import step_stats_pb2 +from tensorflow.core.profiler import tfprof_log_pb2 +from tensorflow.python.platform import test +import benchmark_cnn +import datasets +import flags +import preprocessing +import test_util +import variable_mgr_util +from platforms import util as platforms_util + + +def _check_has_gpu(): + if not test.is_gpu_available(cuda_only=True): + raise ValueError( + """You have asked to run part or all of this on GPU, but it appears + that no GPU is available. If your machine has GPUs it is possible you + do not have a version of TensorFlow with GPU support. To build with GPU + support, add --config=cuda to the build flags.\n """) + + +class TfCnnBenchmarksModelTest(tf.test.TestCase): + """Tests which are run with multiple models.""" + + def setUp(self): + super(TfCnnBenchmarksModelTest, self).setUp() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + def get_model_name(self): + return None + + # Return true to run tests that don't need to be run on every model. + # This should be done for one or two cheap models. + def extended_tests(self): + return False + + # Return false to suppress actually running the model; this is useful + # for tests that are large. + def model_execution_test(self): + return False + + # Return false to suppress actually saving and loading the model. + def model_save_load_test(self): + return False + + def testSaveLoadModel(self): + _check_has_gpu() + if not self.get_model_name() or not self.model_save_load_test(): + return + + params = benchmark_cnn.make_params( + model=self.get_model_name(), + num_batches=1, + num_intra_threads=0, + num_inter_threads=0, + distortions=False, + batch_size=2, + variable_update='replicated', + num_warmup_batches=0, + num_gpus=2, + train_dir=test_util.get_temp_dir('testSaveLoadModel_' + + self.get_model_name())) + + # Run one batch and save the model. + # Note that this uses a non-test session. + bench = benchmark_cnn.BenchmarkCNN(params) + bench.run() + self.assertEqual(bench.init_global_step, 0) + # Clear the default graph. + tf.reset_default_graph() + # Test if checkpoint had been saved. + ckpt = tf.train.get_checkpoint_state(params.train_dir) + match = re.match(os.path.join(params.train_dir, r'model.ckpt-(\d+).index'), + ckpt.model_checkpoint_path + '.index') + self.assertTrue(match) + self.assertGreaterEqual(int(match.group(1)), params.num_batches) + params = params._replace(num_batches=2) + # Reload the model + bench = benchmark_cnn.BenchmarkCNN(params) + bench.run() + # Check if global step has been restored. 
+ self.assertNotEqual(bench.init_global_step, 0) + ckpt = tf.train.get_checkpoint_state(params.train_dir) + match = re.match(os.path.join(params.train_dir, r'model.ckpt-(\d+).index'), + ckpt.model_checkpoint_path + '.index') + self.assertTrue(match) + self.assertGreaterEqual(int(match.group(1)), params.num_batches) + # Check that the batch norm moving averages are restored from checkpoints + with tf.Graph().as_default(): + bench = benchmark_cnn.BenchmarkCNN(params) + bench._build_model() + saver = tf.train.Saver(bench.variable_mgr.savable_variables()) + with tf.Session(config=benchmark_cnn.create_config_proto(params)) as sess: + benchmark_cnn.load_checkpoint(saver, sess, params.train_dir) + sess.run(bench.variable_mgr.get_post_init_ops()) + bn_moving_vars = [ + v for v in tf.global_variables() + if '/batchnorm' in v.name and '/moving' in v.name + ] + self.assertGreater(len(bn_moving_vars), 0) + for moving_var in bn_moving_vars: + moving_var_value = sess.run(moving_var) + # Check that the moving means and moving variances have been restored + # by asserting they are not their default values of 0 and 1, + # respectively + if '/moving_mean' in moving_var.name: + self.assertFalse(np.array_equal(moving_var_value, + np.zeros(moving_var_value.shape, + moving_var_value.dtype))) + else: + self.assertIn('/moving_variance', moving_var.name) + self.assertFalse(np.array_equal(moving_var_value, + np.ones(moving_var_value.shape, + moving_var_value.dtype))) + + def testModel(self): + _check_has_gpu() + if not self.get_model_name() or not self.model_execution_test(): + return + + params = benchmark_cnn.make_params( + model=self.get_model_name(), + num_batches=1, + num_intra_threads=1, + num_inter_threads=12, + batch_size=2, + distortions=False) + + # Run this one; note that this uses a non-test session. 
+ bench = benchmark_cnn.BenchmarkCNN(params) + bench.run() + + def testSendRecvVariables(self): + self._testVariables('parameter_server') + if self.extended_tests(): + self._testVariables('parameter_server', local_parameter_device='CPU') + self._testVariables('parameter_server', optimizer='sgd') + + def testReplicatedVariables(self): + self._testVariables('replicated') + if self.extended_tests(): + self._testVariables('replicated', all_reduce_spec=None) + self._testVariables('replicated', use_fp16=True, fp16_vars=False) + self._testVariables( + 'replicated', + all_reduce_spec=None, + use_fp16=True, + fp16_vars=False, + fp16_enable_auto_loss_scale=True, + fp16_inc_loss_scale_every_n=4) + + def testIndependentVariables(self): + self._testVariables('independent') + self._testVariables( + 'independent', + all_reduce_spec=None, + use_fp16=True, + fp16_vars=False, + fp16_enable_auto_loss_scale=True, + fp16_inc_loss_scale_every_n=4) + + def testSummaryVerbosity(self): + self._testVariables('parameter_server', summary_verbosity=1) + if self.extended_tests(): + self._testVariables('parameter_server', summary_verbosity=2) + self._testVariables('parameter_server', summary_verbosity=3) + + def testStagedVariables(self): + self._testVariables('parameter_server', staged_vars=True) + if self.extended_tests(): + self._testVariables('parameter_server', staged_vars=True, + local_parameter_device='CPU') + self._testVariables('parameter_server', staged_vars=True, use_fp16=True, + fp16_vars=True) + + def _assert_correct_var_type(self, var, params): + if 'gpu_cached_inputs' not in var.name: + if params.use_fp16 and params.fp16_vars and 'batchnorm' not in var.name: + expected_type = tf.float16 + else: + expected_type = tf.float32 + self.assertEqual(var.dtype.base_dtype, expected_type) + + def _testVariables(self, + variable_update, + summary_verbosity=0, + local_parameter_device='GPU', + staged_vars=False, + optimizer='momentum', + # TODO(b/80125832): Enable nccl in tests + # all_reduce_spec='nccl', + all_reduce_spec='', + use_fp16=False, + fp16_vars=False, + fp16_enable_auto_loss_scale=False, + fp16_inc_loss_scale_every_n=10): + if not self.get_model_name(): + return + _check_has_gpu() + + params = benchmark_cnn.make_params( + model=self.get_model_name(), + num_batches=1, + num_intra_threads=1, + num_inter_threads=12, + distortions=False, + variable_update=variable_update, + local_parameter_device=local_parameter_device, + num_gpus=2, + summary_verbosity=summary_verbosity, + staged_vars=staged_vars, + optimizer=optimizer, + all_reduce_spec=all_reduce_spec, + compact_gradient_transfer=False if all_reduce_spec == 'nccl' else True, + use_fp16=use_fp16, + fp16_loss_scale=2., + fp16_vars=fp16_vars, + fp16_enable_auto_loss_scale=fp16_enable_auto_loss_scale, + fp16_inc_loss_scale_every_n=fp16_inc_loss_scale_every_n, + ) + + # Test building models using multiple GPUs, but don't + # run them. + with self.test_session(graph=tf.Graph()): + bench = benchmark_cnn.BenchmarkCNN(params) + bench._build_model() + + # Rough validation of variable type and placement, depending on mode. 
+ all_vars = tf.global_variables() + tf.local_variables() + if params.variable_update == 'parameter_server': + for v in all_vars: + tf.logging.debug('var: %s' % v.name) + match = re.match(r'tower_(\d+)/v/gpu_cached_inputs:0', v.name) + if match: + self.assertEqual(v.device, '/device:GPU:%s' % match.group(1)) + elif v.name.startswith('v/'): + self.assertEqual(v.device, '/device:%s:0' % local_parameter_device) + self._assert_correct_var_type(v, params) + elif v.name in ('input_processing/images:0', + 'input_processing/labels:0', 'init_learning_rate:0', + 'global_step:0', 'loss_scale:0', + 'loss_scale_normal_steps:0'): + self.assertEqual(v.device, '/device:CPU:0') + else: + raise ValueError('Unexpected variable %s' % v.name) + else: + v0_count = 0 + v1_count = 0 + for v in all_vars: + if v.name.startswith('tower_0/v0/'): + self.assertEqual(v.name, 'tower_0/v0/gpu_cached_inputs:0') + self.assertEqual(v.device, '/device:GPU:0') + elif v.name.startswith('tower_1/v1/'): + self.assertEqual(v.name, 'tower_1/v1/gpu_cached_inputs:0') + self.assertEqual(v.device, '/device:GPU:1') + elif v.name.startswith('v0/'): + v0_count += 1 + self.assertEqual(v.device, '/device:GPU:0') + self._assert_correct_var_type(v, params) + elif v.name.startswith('v1/'): + v1_count += 1 + self.assertEqual(v.device, '/device:GPU:1') + self._assert_correct_var_type(v, params) + elif v.name in ('input_processing/images:0', + 'input_processing/labels:0', 'init_learning_rate:0', + 'global_step:0', 'loss_scale:0', + 'loss_scale_normal_steps:0'): + self.assertEqual(v.device, '/device:CPU:0') + else: + raise ValueError('Unexpected variable %s' % v.name) + self.assertEqual(v0_count, v1_count) + + # Validate summary ops in the model depending on verbosity level + summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES) + num_summary_ops = len(summary_ops) + self.assertEqual(num_summary_ops > 0, summary_verbosity > 0) + if summary_verbosity > 0: + has_affine_histogram = False + has_gradient_histogram = False + has_log_gradients_histogram = False + for op in summary_ops: + if '/gradients' in op.name: + has_gradient_histogram = True + elif '/affine' in op.name: + has_affine_histogram = True + elif 'log_gradients' in op.name: + has_log_gradients_histogram = True + self.assertEqual(summary_verbosity >= 3, has_affine_histogram) + self.assertEqual(summary_verbosity >= 3, has_gradient_histogram) + self.assertEqual(summary_verbosity >= 2, has_log_gradients_histogram) + if summary_verbosity == 1: + self.assertLess(num_summary_ops, 10) + + +class TrivialModelTest(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'trivial' + + +class TestVgg1Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'vgg11' + + +class TestVgg19Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'vgg19' + + +class TestLenet5Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'lenet' + + +class TestGooglenetModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'googlenet' + + +class TestOverfeatModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'overfeat' + + +class TestAlexnetModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'alexnet' + + def extended_tests(self): + return True + + +class TestTrivialModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'trivial' + + +class TestInceptionv3Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'inception3' + + def extended_tests(self): + return True + + 
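The per-model test classes above and below all follow the same minimal pattern: each subclass of `TfCnnBenchmarksModelTest` only overrides `get_model_name()`, plus the optional `extended_tests()` / `model_save_load_test()` / `model_execution_test()` hooks that gate the more expensive checks. As a hedged illustration (not part of this patch), covering one more model would look like the sketch below; the model name `mobilenet` is an assumption, not something the patch adds.

```python
# Illustrative sketch only, not part of the patch. The base class skips every
# test when get_model_name() returns None, so only concrete subclasses run.
class TestMobilenetModel(TfCnnBenchmarksModelTest):  # 'mobilenet' name assumed

  def get_model_name(self):
    return 'mobilenet'

  def extended_tests(self):
    # Opt this (cheap) model into the extra _testVariables variants.
    return True
```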
+class TestInceptionv4Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'inception4' + + +class TestResnet50Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet50' + + def model_save_load_test(self): + return True + + +class TestResnet101Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet101' + + +class TestResnet152Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet152' + + +class TestResnet50V2Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet50_v2' + + +class TestResnet101V2Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet101_v2' + + +class TestResnet152V2Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet152_v2' + + +class TfCnnBenchmarksTest(tf.test.TestCase): + """Tests that benchmark_cnn runs correctly.""" + + def setUp(self): + super(TfCnnBenchmarksTest, self).setUp() + _check_has_gpu() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + def _run_benchmark_cnn(self, params): + logs = [] + benchmark_cnn.log_fn = test_util.print_and_add_to_list(logs) + benchmark_cnn.BenchmarkCNN(params).run() + return logs + + def _run_benchmark_cnn_with_fake_images(self, params, images, labels): + logs = [] + benchmark_cnn.log_fn = test_util.print_and_add_to_list(logs) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.input_preprocessor = preprocessing.TestImagePreprocessor( + params.batch_size * params.num_gpus, + [[params.batch_size, 227, 227, 3], [params.batch_size]], + params.num_gpus, + bench.model.data_type) + bench.dataset._queue_runner_required = True + bench.input_preprocessor.set_fake_data(images, labels) + bench.input_preprocessor.expected_subset = ('validation' + if params.eval else 'train') + bench.run() + return logs + + def _run_benchmark_cnn_with_black_and_white_images(self, params): + """Runs BenchmarkCNN with black and white images. + + A BenchmarkCNN is created and run with black and white images as input. Half + the images are black (i.e., filled with 0s) and half are white (i.e., filled + with 255s). + + Args: + params: Params for BenchmarkCNN. + + Returns: + A list of lines from the output of BenchmarkCNN. + """ + # TODO(reedwm): Instead of generating images here, use black and white + # tfrecords by calling test_util.create_black_and_white_images(). + effective_batch_size = params.batch_size * params.num_gpus + half_batch_size = effective_batch_size // 2 + images = np.zeros((effective_batch_size, 227, 227, 3), dtype=np.float32) + images[half_batch_size:, :, :, :] = 255 + labels = np.array([0] * half_batch_size + [1] * half_batch_size, + dtype=np.int32) + return self._run_benchmark_cnn_with_fake_images(params, images, labels) + + def _train_and_eval_local(self, + params, + check_output_values=False, + max_final_loss=10., + skip=None, + use_test_preprocessor=True): + # TODO(reedwm): check_output_values should default to True and be enabled + # on every test. Currently, if check_output_values=True and the calls to + # tf.set_random_seed(...) and np.seed(...) are passed certain seed values in + # benchmark_cnn.py, then most tests will fail. This indicates the tests + # are brittle and could fail with small changes when + # check_output_values=True, so check_output_values defaults to False for + # now. 
+ + def run_fn(run_type, inner_params): + del run_type + if use_test_preprocessor: + return [ + self._run_benchmark_cnn_with_black_and_white_images(inner_params) + ] + else: + return [self._run_benchmark_cnn(inner_params)] + + return test_util.train_and_eval(self, run_fn, params, + check_output_values=check_output_values, + max_final_loss=max_final_loss, + skip=skip) + + def testAlexnet(self): + params = test_util.get_params('testAlexnet')._replace( + num_batches=30, init_learning_rate=0.01, model='alexnet') + self._train_and_eval_local(params) + + def testNoPrintAccuracy(self): + params = test_util.get_params('testNoPrintAccuracy')._replace( + print_training_accuracy=False) + self._train_and_eval_local(params) + + def testLowAccuracy(self): + params = test_util.get_params('testLowAccuracy')._replace( + print_training_accuracy=True, batch_size=5, num_batches=10) + # We force low accuracy by having each batch containing 10 identical images, + # each with a different label. This guarantees a top-1 accuracy of exactly + # 0.1 and a top-5 accuracy of exactly 0.5. + images = np.zeros((10, 227, 227, 3), dtype=np.float32) + labels = np.arange(10, dtype=np.int32) + logs = self._run_benchmark_cnn_with_fake_images(params, images, labels) + training_outputs = test_util.get_training_outputs_from_logs( + logs, params.print_training_accuracy) + last_output = training_outputs[-1] + # TODO(reedwm): These should be assertEqual but for some reason, + # occasionally the accuracies are lower (Running this test 500 times, these + # asserts failed twice). Investigate this problem. + self.assertLessEqual(last_output.top_1_accuracy, 0.1) + self.assertLessEqual(last_output.top_5_accuracy, 0.5) + + def testParameterServer(self): + params = test_util.get_params('testParameterServer') + self._train_and_eval_local(params) + + def testParameterServerStaged(self): + params = test_util.get_params('testParameterServerStaged')._replace( + staged_vars=True) + self._train_and_eval_local(params) + + def testReplicated(self): + params = test_util.get_params('testReplicated')._replace( + variable_update='replicated') + self._train_and_eval_local(params) + + def testIndependent(self): + params = test_util.get_params('testIndependent')._replace( + variable_update='independent') + self._train_and_eval_local(params) + + def testForwardOnly(self): + params = test_util.get_params('testForwardOnly')._replace(forward_only=True) + # Evaluation is not supported with --forward_only, so we set skip='eval'. + self._train_and_eval_local(params, skip='eval') + + def testForwardOnlyAndFreeze(self): + params = test_util.get_params('testForwardOnlyAndFreeze')._replace( + forward_only=True, freeze_when_forward_only=True, train_dir=None) + # Training is not supported with --freeze_when_forward_only. 
+ self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + + def testNoDistortions(self): + params = test_util.get_params('testNoDistortions')._replace( + distortions=False) + self._train_and_eval_local(params) + + def testCpuAsLocalParamDevice(self): + params = test_util.get_params('testCpuAsLocalParamDevice')._replace( + local_parameter_device='cpu') + self._train_and_eval_local(params) + + def testNHWC(self): + params = test_util.get_params('testNHWC')._replace(data_format='NHWC') + self._train_and_eval_local(params) + + def testCpuAsDevice(self): + params = test_util.get_params('testCpuAsDevice')._replace( + device='cpu', data_format='NHWC') # NHWC required when --device=cpu + self._train_and_eval_local(params) + + def testMomentumParameterServer(self): + params = test_util.get_params('testMomentumParameterServer')._replace( + optimizer='momentum', momentum=0.8) + self._train_and_eval_local(params) + + def testRmspropReplicated(self): + params = test_util.get_params('testRmspropReplicated')._replace( + variable_update='replicated', + optimizer='rmsprop', + rmsprop_decay=0.8, + rmsprop_momentum=0.6, + rmsprop_epsilon=0.7, + init_learning_rate=0.01) + self._train_and_eval_local(params) + + def testBatchGroupSize(self): + params = test_util.get_params('testBatchGroupSize')._replace( + batch_group_size=4, num_batches=100, num_warmup_batches=5) + self._train_and_eval_local(params) + + def testGradientClip(self): + params = test_util.get_params('testGradientClip')._replace( + gradient_clip=100.0) + self._train_and_eval_local(params) + + def testWeightDecay(self): + params = test_util.get_params('testWeightDecay')._replace( + weight_decay=0.0001) + self._train_and_eval_local(params) + + def testNoLayers(self): + params = test_util.get_params('testNoLayers')._replace(use_tf_layers=False) + self._train_and_eval_local(params) + + def testSaveModelSteps(self): + params = test_util.get_params('testSaveModelSteps')._replace( + save_model_steps=2, num_warmup_batches=0, num_batches=10, + max_ckpts_to_keep=3) + self._train_and_eval_local(params) + for i in range(1, 20 + 1): + # We train for 20 steps, since self._train_and_eval_local() does two + # training runs of 10 steps each. We save a checkpoint every 2 steps and + # keep the last 3 checkpoints, so at the end, we should have checkpoints + # for steps 16, 18, and 20. + matches = glob.glob(os.path.join(params.train_dir, + 'model.ckpt-{}.*'.format(i))) + if i in (16, 18, 20): + self.assertTrue(matches) + else: + self.assertFalse(matches) + + def testFp16WithFp32Vars(self): + params = test_util.get_params('testFp16WithFp32Vars')._replace( + use_fp16=True, fp16_vars=False, fp16_loss_scale=1.) 
+ self._train_and_eval_local(params) + + def testFp16WithFp16Vars(self): + params = test_util.get_params('testFp16WithFp16Vars')._replace( + use_fp16=True, fp16_vars=True) + self._train_and_eval_local(params) + + def testXlaCompile(self): + params = test_util.get_params('testXlaCompile')._replace(xla_compile=True) + self._train_and_eval_local(params) + + @unittest.skip('Fails for unknown reason') + def testXlaCompileWithFp16(self): + params = test_util.get_params('testXlaCompileWithFp16')._replace( + use_fp16=True, xla_compile=True) + self._train_and_eval_local(params) + + def testGradientRepacking(self): + params = test_util.get_params('testGradientRepacking1')._replace( + gradient_repacking=2) + self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + params = test_util.get_params('testGradientRepacking2')._replace( + gradient_repacking=2, use_fp16=True) + self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + + def testTraceFileChromeTraceFormat(self): + trace_file = os.path.join(self.get_temp_dir(), + 'testTraceFileChromeTraceFormat_tracefile') + params = test_util.get_params('testTraceFileChromeTraceFormat')._replace( + trace_file=trace_file, use_chrome_trace_format=True) + self._train_and_eval_local(params) + self.assertGreater(os.stat(trace_file).st_size, 0) + + def testTraceFileStepStatsProto(self): + trace_file = os.path.join(self.get_temp_dir(), + 'testTraceFileStepStatsProto_tracefile') + params = test_util.get_params('testTraceFileStepStatsProto')._replace( + trace_file=trace_file, use_chrome_trace_format=False) + self._train_and_eval_local(params) + self.assertGreater(os.stat(trace_file).st_size, 0) + with open(trace_file) as f: + step_stats = step_stats_pb2.StepStats() + # The following statement should not raise an exception. + contents = f.read() + text_format.Merge(contents, step_stats) + + def testTfprofFile(self): + tfprof_file = os.path.join(self.get_temp_dir(), 'testTfprofFile_tfproffile') + params = test_util.get_params('testTfprofFile')._replace( + tfprof_file=tfprof_file) + self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + self.assertGreater(os.stat(tfprof_file).st_size, 0) + with open(tfprof_file, 'rb') as f: + profile_proto = tfprof_log_pb2.ProfileProto() + # The following statement should not raise an exception. 
+ profile_proto.ParseFromString(f.read()) + + @unittest.skip('Fails for unknown reason') + def testMoveTrainDir(self): + params = test_util.get_params('testMoveTrainDir') + self._train_and_eval_local(params) + new_train_dir = params.train_dir + '_moved' + os.rename(params.train_dir, new_train_dir) + params = params._replace(train_dir=new_train_dir, eval=True) + self._run_benchmark_cnn_with_black_and_white_images(params) + + @mock.patch('tensorflow.compat.v1.train.Saver') + @mock.patch('benchmark_cnn._get_checkpoint_to_load') + def testLoadCheckpoint(self, mock_checkpoint_to_load, mock_saver): + """Tests load checkpoint with full path to checkpoint.""" + expected_checkpoint = '/path/to/checkpoints/model.ckpt-1243' + mock_checkpoint_to_load.return_value = expected_checkpoint + + global_batch = benchmark_cnn.load_checkpoint(mock_saver, + None, + expected_checkpoint) + self.assertEqual(global_batch, 1243) + + def testGetCheckpointToLoadFullPath(self): + """Tests passing full path.""" + ckpt_path = '/foo/bar/model.ckpt-189' + full_path = benchmark_cnn._get_checkpoint_to_load(ckpt_path) + self.assertEqual(full_path, ckpt_path) + + def testGetCheckpointToLoadException(self): + """Tests exception for directory without a checkpoint.""" + ckpt_path = '/foo/bar/checkpoints' + self.assertRaises(benchmark_cnn.CheckpointNotFoundException, + benchmark_cnn._get_checkpoint_to_load, ckpt_path) + + @mock.patch('tensorflow.compat.v1.train.get_checkpoint_state') + def testGetCheckpointToLoad(self, mock_checkpoint_state): + """Tests passing path to checkpoint folder.""" + expected_checkpoint = '/path/to/checkpoints/model.ckpt-1243' + mock_checkpoint_state.return_value = mock.Mock( + model_checkpoint_path=expected_checkpoint) + ckpt_path = '/path/to/checkpoints/' + full_path = benchmark_cnn._get_checkpoint_to_load(ckpt_path) + self.assertEqual(full_path, expected_checkpoint) + + def testImagenetPreprocessor(self): + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params('testImagenetPreprocessor')._replace( + data_dir=imagenet_dir, data_name='imagenet') + self._train_and_eval_local(params, use_test_preprocessor=False) + + def testImagenetPreprocessorNoDistortions(self): + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params( + 'testImagenetPreprocessorNoDistortions')._replace( + data_dir=imagenet_dir, data_name='imagenet', distortions=False) + self._train_and_eval_local(params, use_test_preprocessor=False) + + def testImagenetPreprocessorVerboseSummary(self): + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params( + 'testImagenetPreprocessorVerboseSummary')._replace( + data_dir=imagenet_dir, data_name='imagenet', distortions=False, + summary_verbosity=2) + self._train_and_eval_local(params, use_test_preprocessor=False) + + def testCifar10SyntheticData(self): + params = test_util.get_params('testCifar10SyntheticData')._replace( + data_name='cifar10') + self._train_and_eval_local(params) + + def testShiftRatio(self): + test_util.monkey_patch_base_cluster_manager() + params = benchmark_cnn.make_params( + data_name='imagenet', + data_dir=os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data'), + job_name='worker', + worker_hosts='w1,w2,w3,w4', + ps_hosts='p1', + task_index=0) + self.assertEqual( + benchmark_cnn.BenchmarkCNN(params).input_preprocessor.shift_ratio, 0.0) + params = 
params._replace(task_index=3) + self.assertEqual( + benchmark_cnn.BenchmarkCNN(params).input_preprocessor.shift_ratio, 0.75) + + def testDistributedReplicatedSavableVars(self): + test_util.monkey_patch_base_cluster_manager() + params = benchmark_cnn.make_params( + variable_update='distributed_replicated', + model='inception4', + data_name='imagenet', + data_dir=os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data'), + job_name='worker', + worker_hosts='w1,w2,w3,w4', + ps_hosts='p1', + datasets_use_prefetch=False) + + bench = benchmark_cnn.BenchmarkCNN(params) + with tf.Graph().as_default(): + bench._build_model() + savable_vars = bench.variable_mgr.savable_variables() + # Assert all global variables are in savable_vars + for v in tf.global_variables(): + if not v.name.startswith( + variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0'): + self.assertEqual(v.name, 'global_step:0') + name = bench.variable_mgr._strip_port(v.name) + if name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX): + name = name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):] + self.assertIn(name, savable_vars) + self.assertIn(savable_vars[name], tf.global_variables()) + # Assert all local variables on the first tower are in savable_vars + for v in tf.local_variables(): + if v.name.startswith('v0/'): + name = bench.variable_mgr._strip_port(v.name) + self.assertIn(name, savable_vars) + + def _test_preprocessing_eval(self, image_height, image_width, output_height, + output_width): + image = tf.fill((image_height, image_width, 3), + tf.constant(128, dtype=tf.uint8)) + params = benchmark_cnn.make_params() + new_image = preprocessing.eval_image(image, output_height, output_width, 0, + 'bilinear', params.summary_verbosity) + with self.test_session() as sess: + new_image_value = sess.run(new_image) + self.assertAllEqual(new_image_value, + np.full((output_height, output_width, 3), 128, + dtype=np.uint8)) + + def testPreprocessingEval(self): + self._test_preprocessing_eval(10, 10, 4, 4) + self._test_preprocessing_eval(4, 4, 10, 10) + self._test_preprocessing_eval(1, 100, 100, 1) + self._test_preprocessing_eval(100, 1, 1, 100) + self._test_preprocessing_eval(1, 100, 1, 100) + + def _test_preprocessing_traing(self, image_buf, image_color, + output_height, output_width, bbox, + batch_position, resize_method, distortions, + summary_verbosity, fuse_decode_and_crop): + new_image = preprocessing.train_image( + image_buf, + output_height, + output_width, + bbox, + batch_position, + resize_method, + distortions, + summary_verbosity=summary_verbosity, + fuse_decode_and_crop=fuse_decode_and_crop) + self.assertEqual(new_image.shape, [output_height, output_width, 3]) + with self.test_session(use_gpu=True) as sess: + new_image_value = sess.run(new_image) + self.assertAllClose( + new_image_value, + np.full( + [output_height, output_width, 3], + image_color, + dtype=np.float32), + atol=50., + rtol=0.) 
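`testShiftRatio` above expects a `shift_ratio` of 0.0 for task 0 and 0.75 for task 3 when four workers are configured, which is consistent with each worker starting its read of the input data at an offset of `task_index / num_workers`. The following standalone sketch is an inference from those assertions, not code taken from the patch.

```python
# Assumption inferred from the assertions in testShiftRatio: each worker
# shifts its input pipeline by task_index / num_workers so that the workers
# read staggered slices of the dataset.
def expected_shift_ratio(task_index, num_workers):
    return task_index / float(num_workers)

assert expected_shift_ratio(0, 4) == 0.0   # first of four workers
assert expected_shift_ratio(3, 4) == 0.75  # fourth worker, as asserted above
```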
+ + def testPreprocessingTrain(self): + test_data_dir = os.path.join(platforms_util.get_test_data_dir(), 'images') + black_file = os.path.join(test_data_dir, 'black_image.jpg') + with open(black_file, 'rb') as f: + black_jpg_buffer = f.read() + white_file = os.path.join(test_data_dir, 'white_image.jpg') + with open(white_file, 'rb') as f: + white_jpg_buffer = f.read() + bbox = tf.zeros((1, 0, 4), dtype=tf.float32) + batch_position = 0 + # Each size config is (output_height, output_width, resize_method) + size_configs = [(100, 100, 'round_robin'), (150, 10, 'bilinear'), + (10, 150, 'nearest')] + # Each image config is (image_buf, image_color) + image_configs = [(white_jpg_buffer, 255), (black_jpg_buffer, 0)] + for (image_buf, image_color) in image_configs: + for output_height, output_width, resize_method in size_configs: + for distortions in [True, False]: + for summary_verbosity in [0, 2]: + for fuse_decode_and_crop in [True, False]: + self._test_preprocessing_traing( + image_buf, image_color, output_height, output_width, bbox, + batch_position, resize_method, distortions, summary_verbosity, + fuse_decode_and_crop) + + def _test_learning_rate(self, params, global_step_to_expected_learning_rate): + self.longMessage = True # pylint: disable=invalid-name + bench = benchmark_cnn.BenchmarkCNN(params) + with tf.Graph().as_default() as graph: + bench._build_model() + global_step = graph.get_tensor_by_name('global_step:0') + learning_rate = graph.get_tensor_by_name('learning_rate_tensor:0') + with self.test_session(graph=graph, use_gpu=True) as sess: + items = global_step_to_expected_learning_rate.items() + for global_step_val, expected_learning_rate in items: + self.assertAlmostEqual(sess.run(learning_rate, + {global_step: global_step_val}), + expected_learning_rate, + msg='at global_step:{}'. + format(global_step_val)) + + def testLearningRateModelSpecificResNet(self): + params = benchmark_cnn.make_params(model='resnet50', + batch_size=256, + variable_update='parameter_server', + num_gpus=1) + self._test_learning_rate(params, { + 0: 0, + 150136: 0.128, + 150137: 0.0128, + 300273: 0.0128, + 300274: 0.00128, + 10000000: 0.0000128 + }) + + def testLearningRateUserProvidedInitLr(self): + params = benchmark_cnn.make_params(model='resnet50', + batch_size=256, + variable_update='replicated', + init_learning_rate=1.) + self._test_learning_rate(params, { + 0: 1., + 10000000: 1. + }) + + def testLearningRateUserProvidedInitLrAndWarmup(self): + params = benchmark_cnn.make_params(model='resnet50', + batch_size=256, + variable_update='replicated', + init_learning_rate=1., + num_learning_rate_warmup_epochs=5) + self._test_learning_rate(params, { + 0: 0., + 12511: 0.5, + 25022: 1., + 10000000: 1. + }) + + def testLearningRateUserProvidedDecayInfo(self): + params = benchmark_cnn.make_params(model='resnet50', + init_learning_rate=1., + learning_rate_decay_factor=0.5, + num_epochs_per_decay=2, + minimum_learning_rate=0.3750, + batch_size=32) + self._test_learning_rate(params, { + 0: 1., + 80071: 1., + 80072: 0.5, + 160143: 0.5, + 160144: 0.375, + 10000000: 0.375 + }) + + def testLearningRateUserProvidedZeroDecay(self): + params = benchmark_cnn.make_params(model='resnet50', + num_learning_rate_warmup_epochs=0, + learning_rate_decay_factor=0.5, + num_epochs_per_decay=0, + minimum_learning_rate=0.3750, + batch_size=32) + with self.assertRaises(ValueError): + with tf.Graph().as_default(): + # This will fail because params.learning_rate_decay_factor cannot be + # nonzero if params.num_epochs_per_decay is zero. 
+ benchmark_cnn.BenchmarkCNN(params)._build_model() + + def testLearningRateUserProvidedSchedule(self): + params = benchmark_cnn.make_params( + model='trivial', + batch_size=32, + piecewise_learning_rate_schedule='1;3;.1;5;.01') + self._test_learning_rate(params, { + 0: 1., + 120108: 1., + 120109: 0.1, + 200181: 0.1, + 200182: 0.01, + 100000000: 0.01 + }) + + def testNumBatchesAndEpochs(self): + params = benchmark_cnn.make_params() + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 10, 100) + self.assertEqual(batches, benchmark_cnn._DEFAULT_NUM_BATCHES) + self.assertAlmostEqual(epochs, + float(benchmark_cnn._DEFAULT_NUM_BATCHES) / 10) + + params = benchmark_cnn.make_params(num_batches=21) + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 25, 50) + self.assertEqual(batches, 21) + self.assertAlmostEqual(epochs, 10.5) + + params = benchmark_cnn.make_params(num_epochs=3) + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3) + self.assertEqual(batches, 5) + self.assertAlmostEqual(epochs, 10./3.) + + params = benchmark_cnn.make_params(num_epochs=4) + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3) + self.assertEqual(batches, 6) + self.assertAlmostEqual(epochs, 4) + + with self.assertRaises(ValueError): + params = benchmark_cnn.make_params(num_batches=100, num_epochs=100) + benchmark_cnn.get_num_batches_and_epochs(params, 1, 1) + + def _testEvalDuringTraining(self, params, expected_num_eval_batches_found): + # The idea of this test is that all train images are black and all eval + # images are white. We pass the images through the TestModel, and ensure + # the outputs are as expected. + + batch_size = params.batch_size + eval_batch_size = params.eval_batch_size or params.batch_size + + class TestModel(test_util.TestCNNModel): + + def __init__(self): + super(TestModel, self).__init__() + self.depth = 3 + + def add_inference(self, cnn): + if cnn.phase_train: + # This will allow us to test that 100 is only added during training + # and not during eval. + cnn.top_layer += 100 + assert cnn.top_layer.shape[0] == batch_size + else: + assert cnn.top_layer.shape[0] == eval_batch_size + + # Reduce the image to a single number. The number should be (-1 + 100) + # during training and 1 during testing. + cnn.top_layer = tf.reshape(cnn.top_layer, (cnn.top_layer.shape[0], -1)) + cnn.top_layer = tf.reduce_mean(cnn.top_layer, axis=1) + cnn.top_layer = tf.reshape(cnn.top_layer, + (cnn.top_layer.shape[0], 1, 1, 1)) + cnn.top_size = 1 + trainable_vars = tf.trainable_variables() + + # The super method will compute image*A*B, where A=1 and B=2. + super(TestModel, self).add_inference(cnn) + + if not cnn.phase_train: + # Assert no new variables were added, since they should be reused from + # training. 
+ assert len(trainable_vars) == len(tf.trainable_variables()) + + model = TestModel() + dataset = datasets.ImagenetDataset(params.data_dir) + logs = [] + bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=model, dataset=dataset) + with test_util.monkey_patch(benchmark_cnn, + log_fn=test_util.print_and_add_to_list(logs)): + bench_cnn.run() + training_outputs = test_util.get_training_outputs_from_logs( + logs, print_training_accuracy=False) + self.assertEqual(len(training_outputs), params.num_batches) + expected_training_output = (-1 + 100) * 1 * 2 + for training_output in training_outputs: + self.assertEqual(training_output.loss, expected_training_output) + eval_outputs = test_util.get_evaluation_outputs_from_logs(logs) + self.assertTrue(eval_outputs) + expected_eval_output = 1 * 1 * 2 + for eval_output in eval_outputs: + self.assertEqual(eval_output.top_1_accuracy, expected_eval_output) + self.assertEqual(eval_output.top_5_accuracy, expected_eval_output) + + num_eval_batches_found = 0 + eval_batch_regex = re.compile(r'^\d+\t[0-9.]+ examples/sec$') + for log in logs: + if eval_batch_regex.match(log): + num_eval_batches_found += 1 + self.assertEqual(num_eval_batches_found, expected_num_eval_batches_found) + + def testEvalDuringTraining(self): + data_dir = test_util.create_black_and_white_images() + base_params = test_util.get_params('testEvalDuringTraining') + train_dir = base_params.train_dir + base_params = base_params._replace( + train_dir=None, print_training_accuracy=False, num_warmup_batches=0, + num_batches=7, num_eval_batches=2, display_every=1, + init_learning_rate=0, weight_decay=0, + distortions=False, data_dir=data_dir) + expected_num_eval_batches_found = ( + base_params.num_eval_batches * (base_params.num_batches // 2 + 1)) + + # Test --eval_during_training_every_n_steps + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='parameter_server'), + expected_num_eval_batches_found) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='replicated'), + expected_num_eval_batches_found) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='replicated', + summary_verbosity=2, + save_summaries_steps=2, + datasets_use_prefetch=False), + expected_num_eval_batches_found) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='replicated', + use_fp16=True, train_dir=train_dir, + eval_batch_size=base_params.batch_size + 2), + expected_num_eval_batches_found) + + # Test --eval_during_training_every_n_epochs + every_n_epochs = (2 * base_params.batch_size * base_params.num_gpus / + datasets.IMAGENET_NUM_TRAIN_IMAGES) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_epochs=every_n_epochs, + variable_update='replicated'), + expected_num_eval_batches_found) + + # Test --eval_during_training_at_specified_steps + list_steps = [2, 3, 5, 7, 1000] + num_eval_steps = 1 + sum(1 for step in list_steps + if step < base_params.num_batches) + expected_num_eval_batches_found = ( + base_params.num_eval_batches * num_eval_steps) + + self._testEvalDuringTraining( + base_params._replace(eval_during_training_at_specified_steps=list_steps, + variable_update='replicated'), + expected_num_eval_batches_found) + + # Test --eval_during_training_at_specified_epochs + list_epochs = [(step * base_params.batch_size * base_params.num_gpus / + 
datasets.IMAGENET_NUM_TRAIN_IMAGES) + for step in list_steps] + self._testEvalDuringTraining( + base_params._replace( + eval_during_training_at_specified_epochs=list_epochs, + variable_update='replicated'), + expected_num_eval_batches_found) + + # Test --eval_during_training_every_n_steps runs with synthetic data. + params = base_params._replace( + variable_update='replicated', data_dir=None, + eval_during_training_every_n_steps=2, num_batches=2) + benchmark_cnn.BenchmarkCNN(params).run() + + def testEvalDuringTrainingNumEpochs(self): + params = benchmark_cnn.make_params( + batch_size=1, eval_batch_size=2, eval_during_training_every_n_steps=1, + num_batches=30, num_eval_epochs=100 / datasets.IMAGENET_NUM_VAL_IMAGES) + bench_cnn = benchmark_cnn.BenchmarkCNN(params) + self.assertEqual(bench_cnn.num_batches, 30) + self.assertAlmostEqual(bench_cnn.num_epochs, + 30 / datasets.IMAGENET_NUM_TRAIN_IMAGES) + self.assertAlmostEqual(bench_cnn.num_eval_batches, 50) + self.assertAlmostEqual(bench_cnn.num_eval_epochs, + 100 / datasets.IMAGENET_NUM_VAL_IMAGES) + + def testEarlyStopping(self): + params = benchmark_cnn.make_params( + batch_size=2, + display_every=1, + num_batches=100, + eval_during_training_every_n_steps=2, + stop_at_top_1_accuracy=0.4, + ) + with mock.patch.object(benchmark_cnn.BenchmarkCNN, '_eval_once', + side_effect=[(0.1, 0.1), (0.5, 0.5), (0.2, 0.2)] + ) as mock_eval_once: + logs = [] + bench_cnn = benchmark_cnn.BenchmarkCNN(params) + with test_util.monkey_patch(benchmark_cnn, + log_fn=test_util.print_and_add_to_list(logs)): + bench_cnn.run() + training_outputs = test_util.get_training_outputs_from_logs( + logs, print_training_accuracy=False) + # We should stop after the second evaluation, and we evaluate every 2 + # steps. So there should be 2 * 2 = 4 training outputs. + self.assertEqual(len(training_outputs), 4) + self.assertEqual(mock_eval_once.call_count, 2) + + def testOutOfRangeErrorsAreNotIgnored(self): + error_msg = 'Fake OutOfRangeError error message' + with mock.patch.object(benchmark_cnn.BenchmarkCNN, 'benchmark_with_session', + side_effect=tf.errors.OutOfRangeError(None, None, + error_msg)): + with self.assertRaisesRegex(RuntimeError, error_msg): + benchmark_cnn.BenchmarkCNN(benchmark_cnn.make_params()).run() + + def testInvalidFlags(self): + params = benchmark_cnn.make_params(device='cpu', data_format='NCHW') + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True, + variable_update='replicated', + all_reduce_spec='nccl') + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + # Automatic loss scaling is only supported for 'replicated', 'ps', + # and 'independent' variable_updates. + invalid_variable_updates = [ + 'distributed_replicated', 'distributed_all_reduce' + ] + for variable_update in invalid_variable_updates: + params = benchmark_cnn.make_params( + use_fp16=True, + fp16_vars=True, + fp16_enable_auto_loss_scale=True, + variable_update=variable_update) + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + # Automatic loss scaling is not supported for 'nccl'. + params = benchmark_cnn.make_params( + use_fp16=True, + fp16_vars=True, + fp16_enable_auto_loss_scale=True, + all_reduce_spec='nccl') + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + # Automatic loss scaling is not supported for 'staged_vars'. 
+ params = benchmark_cnn.make_params( + use_fp16=True, + fp16_vars=True, + fp16_enable_auto_loss_scale=True, + staged_vars=True) + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + def testMakeParams(self): + default_params = benchmark_cnn.make_params() + self.assertEqual(default_params.model, + flags.param_specs['model'].default_value) + params = benchmark_cnn.make_params(model='foo') + self.assertEqual(params.model, 'foo') + with self.assertRaises(ValueError): + benchmark_cnn.make_params(job_name='foo') + with self.assertRaises(ValueError): + benchmark_cnn.make_params(gpu_memory_frac_for_testing=-1.) + + +class VariableUpdateTest(tf.test.TestCase): + """Tests that variables are updated correctly. + + These tests use a very simple deterministic model. For example, some tests use + the model + + loss = image * A * B + + where image is a 1x1 images (with a single scalar value), and A and B are + scalar variables. Tests will run tf_cnn_benchmarks with such a model, on a + sequence of scalar images, and assert that the losses are the correct value. + Since the losses depend on the variables, this indirectly tests variables are + updated correctly. + """ + + def setUp(self): + super(VariableUpdateTest, self).setUp() + _check_has_gpu() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + def _get_benchmark_cnn_losses(self, inputs, params): + """Returns the losses of BenchmarkCNN on the given inputs and params.""" + logs = [] + model = test_util.TestCNNModel() + with test_util.monkey_patch(benchmark_cnn, + log_fn=test_util.print_and_add_to_list(logs), + LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15): + bench = benchmark_cnn.BenchmarkCNN( + params, dataset=test_util.TestDataSet(), model=model) + # The test model does not use labels when computing loss, so the label + # values do not matter as long as it's the right shape. + labels = np.array([1] * inputs.shape[0]) + bench.input_preprocessor.set_fake_data(inputs, labels) + if bench.eval_input_preprocessor: + bench.eval_input_preprocessor.set_fake_data(inputs, labels) + bench.run() + + outputs = test_util.get_training_outputs_from_logs( + logs, params.print_training_accuracy) + return [x.loss for x in outputs] + + def _test_variable_update(self, params): + """Tests variables are updated correctly when the given params are used. + + A BenchmarkCNN is created with a TestCNNModel, and is run with some scalar + images. The losses are then compared with the losses obtained with + TestCNNModel().manually_compute_losses() + + Args: + params: a Params tuple used to create BenchmarkCNN. + """ + inputs = test_util.get_fake_var_update_inputs() + actual_losses = self._get_benchmark_cnn_losses(inputs, params) + expected_losses, = test_util.TestCNNModel().manually_compute_losses( + inputs, 1, params) + rtol = 3e-2 if params.use_fp16 else 1e-5 + self.assertAllClose(actual_losses[:len(expected_losses)], expected_losses, + rtol=rtol, atol=0.) 
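+
+  # Worked example (hypothetical numbers, for orientation only): with the toy
+  # loss `loss = image * A * B`, suppose A = B = 1, the image value is 2, and
+  # plain SGD with learning rate 0.1 is used. The first loss is 2 * 1 * 1 = 2,
+  # the gradients are dL/dA = image * B = 2 and dL/dB = image * A = 2, so both
+  # variables become 1 - 0.1 * 2 = 0.8 and the next loss on the same image is
+  # 2 * 0.8 * 0.8 = 1.28. manually_compute_losses() in test_util replays this
+  # kind of arithmetic (with its own constants), which is what these tests
+  # compare against.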
+ + def _test_variable_updates(self, params, + var_updates=('parameter_server', 'replicated')): + for var_update in var_updates: + self._test_variable_update(params._replace(variable_update=var_update)) + + def testDefault(self): + params = test_util.get_var_update_params() + self._test_variable_updates(params) + + # For some reason, this test doesn't always pass + + # def testCpuAsDevice(self): + # params = test_util.get_var_update_params()._replace( + # device='cpu', + # data_format='NHWC') # NHWC required when --device=cpu + # self._test_variable_updates(params) + + def testCpuAsLocalParamDevice(self): + params = test_util.get_var_update_params()._replace( + local_parameter_device='cpu') + self._test_variable_updates(params) + + def testFp16(self): + params = test_util.get_var_update_params()._replace(use_fp16=True) + self._test_variable_updates(params) + + def testMomentum(self): + params = test_util.get_var_update_params()._replace(optimizer='momentum') + self._test_variable_updates(params) + + def testRmsprop(self): + params = test_util.get_var_update_params()._replace(optimizer='rmsprop') + self._test_variable_updates(params) + + def testNoLayers(self): + params = test_util.get_var_update_params()._replace(use_tf_layers=False) + self._test_variable_updates(params) + + def testVariousAllReduceSpecs(self): + # We do not test xring, because it requires all Variables to have at least + # two elements. + params = test_util.get_var_update_params()._replace(all_reduce_spec='pscpu') + self._test_variable_updates(params, var_updates=('replicated',)) + params = params._replace(all_reduce_spec='psgpu') + self._test_variable_updates(params, var_updates=('replicated',)) + # TODO(b/80125832): Enable nccl in tests + # params = params._replace(all_reduce_spec='nccl', + # compact_gradient_transfer=False) + # self._test_variable_updates(params, var_updates=('replicated',)) + + def testPrintBaseLoss(self): + params = test_util.get_var_update_params()._replace( + loss_type_to_report='base_loss') + self._test_variable_updates(params) + + def testSingleL2LossOp(self): + params = test_util.get_var_update_params()._replace( + single_l2_loss_op=True) + self._test_variable_updates(params) + + def testResourceVars(self): + params = test_util.get_var_update_params()._replace( + use_resource_vars=True) + self._test_variable_updates(params) + + def testEvalDuringTrainingEveryNSteps(self): + # TODO(reedwm): Test that the eval results are correct. This only tests that + # training results are correct. + params = test_util.get_var_update_params()._replace( + eval_during_training_every_n_steps=1) + self._test_variable_updates(params, var_updates=('replicated',)) + + +class VariableMgrLocalReplicatedTest(tf.test.TestCase): + + def _test_grad_aggregation_with_var_mgr(self, variable_mgr, num_towers, + num_vars, deferred_grads): + tower_devices = ['/gpu:%d' % i for i in range(num_towers)] + tower_grads = [] + expected_sums = [0.] 
* num_vars + for i, tower_device in enumerate(tower_devices): + with tf.device(tower_device): + grad_vars = [] + for j in range(num_vars): + n = num_towers * i + j + grad_vars.append((tf.constant(n, dtype=tf.float32), + tf.Variable(n, dtype=tf.float32))) + expected_sums[j] += n + tower_grads.append(grad_vars) + + _, agg_device_grads = variable_mgr.preprocess_device_grads( + tower_grads) + expected_device_grads = [] + for i in range(num_towers): + expected_grad_vars = [] + for j in range(num_vars): + expected_grad_and_var = [expected_sums[j], num_towers * i + j] + if isinstance(agg_device_grads[i][j], tuple): + # agg_device_grads[i][j] can be a list or tuple. + expected_grad_and_var = tuple(expected_grad_and_var) + expected_grad_vars.append(expected_grad_and_var) + if isinstance(agg_device_grads[i], tuple): + # agg_device_grads[i] can be a list or tuple. + expected_grad_vars = tuple(expected_grad_vars) + expected_device_grads.append(expected_grad_vars) + config = tf.ConfigProto(allow_soft_placement=True) + with tf.Session(config=config) as sess: + sess.run(tf.initialize_all_variables()) + sess.run(variable_mgr._warmup_ops) + if deferred_grads: + # With deferred grads, the result of a session run is always the summed + # gradients from the previous session run. + sess.run(agg_device_grads) + feed_dict = {g: 0 for grad_vars in tower_grads for g, _ in grad_vars} + agg_device_grads_ = sess.run(agg_device_grads, feed_dict) + else: + agg_device_grads_ = sess.run(agg_device_grads) + self.assertEqual(agg_device_grads_, expected_device_grads) + + def _test_grad_aggregation(self, params, num_vars): + bench = benchmark_cnn.BenchmarkCNN(params) + deferred_grads = (params.variable_consistency == 'relaxed') + self._test_grad_aggregation_with_var_mgr(bench.variable_mgr, bench.num_gpus, + num_vars, deferred_grads) + + def test_grad_aggregation(self): + base_params = benchmark_cnn.make_params(num_gpus=10, + variable_update='replicated', + use_fp16=True) + params = base_params + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3) + self._test_grad_aggregation(params, 10) + params = base_params._replace(variable_consistency='relaxed') + self._test_grad_aggregation(params, 10) + params = base_params._replace(compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed') + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(variable_consistency='relaxed', + compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed', + compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(num_gpus=8, hierarchical_copy=True) + self._test_grad_aggregation(params, 10) + # TODO(b/80125832): Enable nccl in tests + # params = base_params._replace(all_reduce_spec='nccl', + # compact_gradient_transfer=False, + # # For some reason, this test freezes when + # # num_gpus=10 + # num_gpus=8) + # self._test_grad_aggregation(params, 10) + params = base_params._replace(all_reduce_spec='pscpu') + self._test_grad_aggregation(params, 10) + + params = base_params._replace(num_gpus=8, + gradient_repacking=3, + variable_consistency='relaxed', + hierarchical_copy=True) + self._test_grad_aggregation(params, 10) 
+ # TODO(b/80125832): Enable nccl in tests + # params = base_params._replace(num_gpus=8, + # gradient_repacking=3, + # variable_consistency='relaxed', + # all_reduce_spec='nccl', + # compact_gradient_transfer=False) + # self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed', + all_reduce_spec='pscpu') + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed', + all_reduce_spec='xring') + self._test_grad_aggregation(params, 10) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/cnn_util.py b/cv/classification/resnet50/tensorflow/cnn_util.py new file mode 100644 index 0000000000000000000000000000000000000000..09e2fe3501e1c49ce30ea9d2131229bf39ed5707 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/cnn_util.py @@ -0,0 +1,253 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Utilities for CNN benchmarks.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import threading + +import numpy as np +import tensorflow.compat.v1 as tf + + +def tensorflow_version_tuple(): + v = tf.__version__ + major, minor, patch = v.split('.') + return (int(major), int(minor), patch) + + +def tensorflow_version(): + vt = tensorflow_version_tuple() + return vt[0] * 1000 + vt[1] + + +def log_fn(log): + print(log) + + +def roll_numpy_batches(array, batch_size, shift_ratio): + """Moves a proportion of batches from start to the end of the array. + + This function moves a proportion of batches, specified by `shift_ratio`, from + the starts of the array to the end. The number of batches moved is rounded + down to the nearest integer. For example, + + ``` + roll_numpy_batches([1, 2, 3, 4, 5, 6], 2, 0.34) == [3, 4, 5, 6, 1, 2] + ``` + + Args: + array: A Numpy array whose first dimension is the batch dimension. + batch_size: The batch size. + shift_ratio: Proportion of batches to move from the start of the array to + the end of the array. + Returns: + A new Numpy array, with a proportion of the batches at the start of `array` + moved to the end. + """ + num_items = array.shape[0] + assert num_items % batch_size == 0 + num_batches = num_items // batch_size + starting_batch = int(num_batches * shift_ratio) + starting_item = starting_batch * batch_size + return np.roll(array, -starting_item, axis=0) + + +# For Python 2.7 compatibility, we do not use threading.Barrier. +class Barrier(object): + """Implements a lightweight Barrier. + + Useful for synchronizing a fixed number of threads at known synchronization + points. Threads block on 'wait()' and simultaneously return once they have + all made that call. 
+ + # Implementation adopted from boost/thread/barrier.hpp + """ + + def __init__(self, parties): + """Create a barrier, initialised to 'parties' threads.""" + self.cond = threading.Condition(threading.Lock()) + self.parties = parties + # Indicates the number of waiting parties. + self.waiting = 0 + # generation is needed to deal with spurious wakeups. If self.cond.wait() + # wakes up for other reasons, generation will force it go back to wait(). + self.generation = 0 + self.broken = False + + def wait(self): + """Wait for the barrier.""" + with self.cond: + # Check if the barrier has been disabled or not. + if self.broken: + return + gen = self.generation + self.waiting += 1 + if self.waiting == self.parties: + self.waiting = 0 + self.generation += 1 + self.cond.notify_all() + # loop because of spurious wakeups + while gen == self.generation: + self.cond.wait() + + # TODO(huangyp): Remove this method once we find a way to know which step + # is the last barrier. + def abort(self): + """Clear existing barrier and disable this barrier.""" + with self.cond: + if self.waiting > 0: + self.generation += 1 + self.cond.notify_all() + self.broken = True + + +class ImageProducer(object): + """An image producer that puts images into a staging area periodically. + + This class is useful for periodically running a set of ops, `put_ops` on a + different thread every `batch_group_size` steps. + + The notify_image_consumption() method is used to increment an internal counter + so that every `batch_group_size` times it is called, `put_ops` is executed. A + barrier is placed so that notify_image_consumption() will block until + the previous call to `put_ops` has been executed. + + The start() method is used to start the thread that runs `put_ops`. + + The done() method waits until the last put_ops is executed and stops the + thread. + + The purpose of this class is to fill an image input pipeline every + `batch_group_size` steps. Suppose `put_ops` supplies `batch_group_size` images + to the input pipeline when run, and that every step, 1 batch of images is + consumed. Then, by calling notify_image_consumption() every step, images are + supplied to the input pipeline at the same amount they are consumed. + + Example usage: + ``` + put_ops = ... # Enqueues `batch_group_size` batches to a StagingArea + get_op = ... # Dequeues 1 batch, and does some operations on it + batch_group_size = 4 + with tf.Session() as sess: + image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size) + image_producer.start() + for _ in range(100): + sess.run(get_op) + image_producer.notify_image_consumption() + ``` + """ + + def __init__(self, sess, put_ops, batch_group_size, use_python32_barrier): + self.sess = sess + self.num_gets = 0 + self.put_ops = put_ops + self.batch_group_size = batch_group_size + self.done_event = threading.Event() + if (use_python32_barrier and + sys.version_info[0] == 3 and sys.version_info[1] >= 2): + self.put_barrier = threading.Barrier(2) + else: + self.put_barrier = Barrier(2) + + def _should_put(self): + return (self.num_gets + 1) % self.batch_group_size == 0 + + def done(self): + """Stop the image producer.""" + self.done_event.set() + self.put_barrier.abort() + self.thread.join() + + def start(self): + """Start the image producer.""" + self.sess.run([self.put_ops]) + self.thread = threading.Thread(target=self._loop_producer) + # Set daemon to true to allow Ctrl + C to terminate all threads. 
+ self.thread.daemon = True + self.thread.start() + + def notify_image_consumption(self): + """Increment the counter of image_producer by 1. + + This should only be called by the main thread that consumes images and runs + the model computation. One batch of images should be consumed between + calling start() and the first call to this method. Then, one batch of images + should be consumed between any two successive calls to this method. + """ + if self._should_put(): + self.put_barrier.wait() + self.num_gets += 1 + + def _loop_producer(self): + while not self.done_event.isSet(): + self.sess.run([self.put_ops]) + self.put_barrier.wait() + + +class BaseClusterManager(object): + """The manager for the cluster of servers running the benchmark.""" + + def __init__(self, params): + worker_hosts = params.worker_hosts.split(',') + ps_hosts = params.ps_hosts.split(',') if params.ps_hosts else [] + cluster = {'worker': worker_hosts} + if ps_hosts: + cluster['ps'] = ps_hosts + self._cluster_spec = tf.train.ClusterSpec(cluster) + + def get_target(self): + """Returns a target to be passed to tf.Session().""" + raise NotImplementedError('get_target must be implemented by subclass') + + def join_server(self): + raise NotImplementedError('join must be implemented by subclass') + + def get_cluster_spec(self): + return self._cluster_spec + + def num_workers(self): + return len(self._cluster_spec.job_tasks('worker')) + + def num_ps(self): + if 'ps' in self._cluster_spec.jobs: + return len(self._cluster_spec.job_tasks('ps')) + else: + return 0 + + +class GrpcClusterManager(BaseClusterManager): + """A cluster manager for a cluster networked with gRPC.""" + + def __init__(self, params, config_proto): + super(GrpcClusterManager, self).__init__(params) + if params.job_name == 'controller': + self._target = 'grpc://%s' % self._cluster_spec.job_tasks('worker')[0] + else: + self._server = tf.train.Server(self._cluster_spec, + job_name=params.job_name, + task_index=params.task_index, + config=config_proto, + protocol=params.server_protocol) + self._target = self._server.target + + def get_target(self): + return self._target + + def join_server(self): + return self._server.join() diff --git a/cv/classification/resnet50/tensorflow/cnn_util_test.py b/cv/classification/resnet50/tensorflow/cnn_util_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7c245afbf8de9d72f8b9287e5a104f1ffd42bde8 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/cnn_util_test.py @@ -0,0 +1,129 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for tf_cnn_benchmarks.cnn_util.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import threading +import time + +import tensorflow.compat.v1 as tf + +import cnn_util + + +class CnnUtilBarrierTest(tf.test.TestCase): + + def testBarrier(self): + num_tasks = 20 + num_waits = 4 + barrier = cnn_util.Barrier(num_tasks) + threads = [] + sync_matrix = [] + for i in range(num_tasks): + sync_times = [0] * num_waits + thread = threading.Thread( + target=self._run_task, args=(barrier, sync_times)) + thread.start() + threads.append(thread) + sync_matrix.append(sync_times) + for thread in threads: + thread.join() + for wait_index in range(num_waits - 1): + # Max of times at iteration i < min of times at iteration i + 1 + self.assertLessEqual( + max([sync_matrix[i][wait_index] for i in range(num_tasks)]), + min([sync_matrix[i][wait_index + 1] for i in range(num_tasks)])) + + def _run_task(self, barrier, sync_times): + for wait_index in range(len(sync_times)): + sync_times[wait_index] = time.time() + barrier.wait() + + def testBarrierAbort(self): + num_tasks = 2 + num_waits = 1 + sync_times = [0] * num_waits + barrier = cnn_util.Barrier(num_tasks) + thread = threading.Thread( + target=self._run_task, args=(barrier, sync_times)) + thread.start() + barrier.abort() + # thread won't be blocked by done barrier. + thread.join() + + +class ImageProducerTest(tf.test.TestCase): + + def _slow_tensorflow_op(self): + """Returns a TensorFlow op that takes approximately 0.1s to complete.""" + def slow_func(v): + time.sleep(0.1) + return v + return tf.py_func(slow_func, [tf.constant(0.)], tf.float32).op + + def _test_image_producer(self, batch_group_size, put_slower_than_get): + # We use the variable x to simulate a staging area of images. x represents + # the number of batches in the staging area. + x = tf.Variable(0, dtype=tf.int32) + if put_slower_than_get: + put_dep = self._slow_tensorflow_op() + get_dep = tf.no_op() + else: + put_dep = tf.no_op() + get_dep = self._slow_tensorflow_op() + with tf.control_dependencies([put_dep]): + put_op = x.assign_add(batch_group_size, use_locking=True) + with tf.control_dependencies([get_dep]): + get_op = x.assign_sub(1, use_locking=True) + with self.test_session() as sess: + sess.run(tf.variables_initializer([x])) + image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size, + use_python32_barrier=False) + image_producer.start() + for _ in range(5 * batch_group_size): + sess.run(get_op) + # We assert x is nonnegative, to ensure image_producer never causes + # an unstage op to block. We assert x is at most 2 * batch_group_size, + # to ensure it doesn't use too much memory by storing too many batches + # in the staging area. 
+ self.assertGreaterEqual(sess.run(x), 0) + self.assertLessEqual(sess.run(x), 2 * batch_group_size) + image_producer.notify_image_consumption() + self.assertGreaterEqual(sess.run(x), 0) + self.assertLessEqual(sess.run(x), 2 * batch_group_size) + + image_producer.done() + time.sleep(0.1) + self.assertGreaterEqual(sess.run(x), 0) + self.assertLessEqual(sess.run(x), 2 * batch_group_size) + + def test_image_producer(self): + self._test_image_producer(1, False) + self._test_image_producer(1, True) + self._test_image_producer(2, False) + self._test_image_producer(2, True) + self._test_image_producer(3, False) + self._test_image_producer(3, True) + self._test_image_producer(8, False) + self._test_image_producer(8, True) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/coco_metric.py b/cv/classification/resnet50/tensorflow/coco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..d8ba67da47c79da96ec3d96feae91169cac7509c --- /dev/null +++ b/cv/classification/resnet50/tensorflow/coco_metric.py @@ -0,0 +1,198 @@ +# Copyright 2018 Google. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""COCO-style evaluation metrics. + +Forked from reference model implementation. + +COCO API: github.com/cocodataset/cocoapi/ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import atexit +import tempfile + +from absl import flags + +import numpy as np +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +import six + +import tensorflow.compat.v1 as tf + +import mlperf +import ssd_constants + +FLAGS = flags.FLAGS + + +# https://github.com/cocodataset/cocoapi/issues/49 +if six.PY3: + import pycocotools.coco + pycocotools.coco.unicode = str + + +def async_eval_runner(queue_predictions, queue_results, val_json_file): + """Load intermediate eval results and get COCO metrics.""" + while True: + message = queue_predictions.get() + if message == 'STOP': # poison pill + break + step, predictions = message + results = compute_map(predictions, val_json_file) + queue_results.put((step, results)) + + +def compute_map(predictions, val_json_file): + """Use model predictions to compute mAP. 
+
+  Args:
+    predictions: a list of tuples returned by the decode_predictions function
+      below, each containing the following elements:
+      image source_id, box coordinates in XYWH order, probability score, label
+    val_json_file: path to COCO annotation file
+  Returns:
+    A dictionary that maps all COCO metrics (keys) to their values
+  """
+
+  if val_json_file.startswith("gs://"):
+    _, local_val_json = tempfile.mkstemp(suffix=".json")
+    tf.gfile.Remove(local_val_json)
+
+    tf.gfile.Copy(val_json_file, local_val_json)
+    atexit.register(tf.gfile.Remove, local_val_json)
+  else:
+    local_val_json = val_json_file
+
+  cocoGt = COCO(local_val_json)
+  cocoDt = cocoGt.loadRes(np.array(predictions))
+  E = COCOeval(cocoGt, cocoDt, iouType='bbox')
+  E.evaluate()
+  E.accumulate()
+  E.summarize()
+  print("Current AP: {:.5f}".format(E.stats[0]))
+  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+
+  # Prefix with "COCO" to group in TensorBoard.
+  return {"COCO/" + key: value for key, value in zip(metric_names, E.stats)}
+
+
+def calc_iou(target, candidates):
+  target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1))
+  # Left Top & Right Bottom
+  lt = np.maximum(target_tiled[:,:2], candidates[:,:2])
+
+  rb = np.minimum(target_tiled[:,2:], candidates[:,2:])
+
+  delta = np.maximum(rb - lt, 0)
+
+  intersect = delta[:,0] * delta[:,1]
+
+  # Areas of the target box and of each candidate box.
+  delta1 = target_tiled[:,2:] - target_tiled[:,:2]
+  area1 = delta1[:,0] * delta1[:,1]
+  delta2 = candidates[:,2:] - candidates[:,:2]
+  area2 = delta2[:,0] * delta2[:,1]
+
+  iou = intersect/(area1 + area2 - intersect)
+  return iou
+
+
+# TODO(haoyuzhang): Rewrite this NumPy based implementation to TensorFlow based
+# implementation under ssd_model.py accuracy_function.
+def decode_predictions(labels_and_predictions):
+  """Decode predictions and remove unused boxes and labels."""
+  predictions = []
+  for example in labels_and_predictions:
+    source_id = int(example[ssd_constants.SOURCE_ID])
+    pred_box = example[ssd_constants.PRED_BOXES]
+    pred_scores = example[ssd_constants.PRED_SCORES]
+
+    locs, labels, probs = decode_single(
+        pred_box, pred_scores, ssd_constants.OVERLAP_CRITERIA,
+        ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)
+
+    raw_height, raw_width, _ = example[ssd_constants.RAW_SHAPE]
+    for loc, label, prob in zip(locs, labels, probs):
+      # Ordering convention differs, hence [1], [0] rather than [0], [1]
+      x, y = loc[1] * raw_width, loc[0] * raw_height
+      w, h = (loc[3] - loc[1]) * raw_width, (loc[2] - loc[0]) * raw_height
+      predictions.append(
+          [source_id, x, y, w, h, prob, ssd_constants.CLASS_INV_MAP[label]])
+  mlperf.logger.log(key=mlperf.tags.NMS_THRESHOLD,
+                    value=ssd_constants.OVERLAP_CRITERIA)
+  mlperf.logger.log(key=mlperf.tags.NMS_MAX_DETECTIONS,
+                    value=ssd_constants.MAX_NUM_EVAL_BOXES)
+  return predictions
+
+
+def decode_single(bboxes_in, scores_in, criteria, max_output, max_num=200):
+  # Reference: https://github.com/amdegroot/ssd.pytorch
+
+  bboxes_out = []
+  scores_out = []
+  labels_out = []
+
+  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
+    score = np.squeeze(score, 1)
+
+    # skip background
+    if i == 0:
+      continue
+
+    mask = score > ssd_constants.MIN_SCORE
+    if not np.any(mask):
+      continue
+
+    bboxes, score = bboxes_in[mask, :], score[mask]
+
+    score_idx_sorted = np.argsort(score)
+    score_sorted = score[score_idx_sorted]
+
+    score_idx_sorted = score_idx_sorted[-max_num:]
+    candidates = []
+
+    # perform non-maximum suppression
+    while
len(score_idx_sorted): + idx = score_idx_sorted[-1] + bboxes_sorted = bboxes[score_idx_sorted, :] + bboxes_idx = bboxes[idx, :] + iou = calc_iou(bboxes_idx, bboxes_sorted) + + score_idx_sorted = score_idx_sorted[iou < criteria] + candidates.append(idx) + + bboxes_out.append(bboxes[candidates, :]) + scores_out.append(score[candidates]) + labels_out.extend([i]*len(candidates)) + + if len(scores_out) == 0: + tf.logging.info("No objects detected. Returning dummy values.") + return ( + np.zeros(shape=(1, 4), dtype=np.float32), + np.zeros(shape=(1,), dtype=np.int32), + np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE, + ) + + bboxes_out = np.concatenate(bboxes_out, axis=0) + scores_out = np.concatenate(scores_out, axis=0) + labels_out = np.array(labels_out) + + max_ids = np.argsort(scores_out)[-max_output:] + + return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] diff --git a/cv/classification/resnet50/tensorflow/constants.py b/cv/classification/resnet50/tensorflow/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..dbb32271bb2669e0ba12588d87d39f7c8924b161 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/constants.py @@ -0,0 +1,67 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Constants used in tf_cnn_benchmarks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from enum import Enum + +# Results fetched with this prefix will not be reduced. Instead, they will be +# passed as matrices to model's postprocess function. +UNREDUCED_ACCURACY_OP_PREFIX = "tensor:" + +# Eval result values with this name prefix will be included in summary. +SIMPLE_VALUE_RESULT_PREFIX = "simple_value:" + + +class BenchmarkMode(object): + """Benchmark running mode.""" + TRAIN = "training" + EVAL = "evaluation" + TRAIN_AND_EVAL = "training + evaluation" + FORWARD_ONLY = "forward only" + + +class NetworkTopology(str, Enum): + """Network topology describes how multiple GPUs are inter-connected. + """ + # DGX-1 uses hybrid cube mesh topology with the following device peer to peer + # matrix: + # DMA: 0 1 2 3 4 5 6 7 + # 0: Y Y Y Y Y N N N + # 1: Y Y Y Y N Y N N + # 2: Y Y Y Y N N Y N + # 3: Y Y Y Y N N N Y + # 4: Y N N N Y Y Y Y + # 5: N Y N N Y Y Y Y + # 6: N N Y N Y Y Y Y + # 7: N N N Y Y Y Y Y + DGX1 = "dgx1" + + # V100 in GCP are connected with the following device peer to peer matrix. + # In this topology, bandwidth of the connection depends on if it uses NVLink + # or PCIe link. 
+ # DMA: 0 1 2 3 4 5 6 7 + # 0: Y Y Y Y N Y N N + # 1: Y Y Y Y N N N N + # 2: Y Y Y Y N N N Y + # 3: Y Y Y Y N N N N + # 4: N N N N Y Y Y Y + # 5: Y N N N Y Y Y Y + # 6: N N N N Y Y Y Y + # 7: N N Y N Y Y Y Y + GCP_V100 = "gcp_v100" diff --git a/cv/classification/resnet50/tensorflow/convnet_builder.py b/cv/classification/resnet50/tensorflow/convnet_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..9903de9247e7401b2982bb061fb6f4bdce7be179 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/convnet_builder.py @@ -0,0 +1,498 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""CNN builder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict +import contextlib + +import numpy as np + +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import mlperf +from tensorflow.python.layers import convolutional as conv_layers +from tensorflow.python.layers import core as core_layers +from tensorflow.python.layers import normalization as normalization_layers +from tensorflow.python.layers import pooling as pooling_layers +from tensorflow.python.training import moving_averages + + +_data_format_to_channel_axis = {'NCHW': 1, 'NHWC': 3} + + +class ConvNetBuilder(object): + """Builder of cnn net.""" + + def __init__(self, + input_op, + input_nchan, + phase_train, + use_tf_layers, + data_format='NCHW', + dtype=tf.float32, + variable_dtype=tf.float32): + self.top_layer = input_op + self.top_size = input_nchan + self.phase_train = phase_train + self.use_tf_layers = use_tf_layers + self.data_format = data_format + self.dtype = dtype + self.variable_dtype = variable_dtype + self.counts = defaultdict(lambda: 0) + self.use_batch_norm = False + self.batch_norm_config = {} # 'decay': 0.997, 'scale': True} + self.channel_pos = ('channels_last' + if data_format == 'NHWC' else 'channels_first') + self.aux_top_layer = None + self.aux_top_size = 0 + + def get_custom_getter(self): + """Returns a custom getter that this class's methods must be called under. + + All methods of this class must be called under a variable scope that was + passed this custom getter. Example: + + ```python + network = ConvNetBuilder(...) + with tf.variable_scope('cg', custom_getter=network.get_custom_getter()): + network.conv(...) + # Call more methods of network here + ``` + + Currently, this custom getter only does anything if self.use_tf_layers is + True. In that case, it causes variables to be stored as dtype + self.variable_type, then casted to the requested dtype, instead of directly + storing the variable as the requested dtype. 
+ """ + def inner_custom_getter(getter, *args, **kwargs): + """Custom getter that forces variables to have type self.variable_type.""" + if not self.use_tf_layers: + return getter(*args, **kwargs) + requested_dtype = kwargs['dtype'] + if not (requested_dtype == tf.float32 and + self.variable_dtype == tf.float16): + # Only change the variable dtype if doing so does not decrease variable + # precision. + kwargs['dtype'] = self.variable_dtype + var = getter(*args, **kwargs) + # This if statement is needed to guard the cast, because batch norm + # assigns directly to the return value of this custom getter. The cast + # makes the return value not a variable so it cannot be assigned. Batch + # norm variables are always in fp32 so this if statement is never + # triggered for them. + if var.dtype.base_dtype != requested_dtype: + var = tf.cast(var, requested_dtype) + return var + return inner_custom_getter + + @contextlib.contextmanager + def switch_to_aux_top_layer(self): + """Context that construct cnn in the auxiliary arm.""" + if self.aux_top_layer is None: + raise RuntimeError('Empty auxiliary top layer in the network.') + saved_top_layer = self.top_layer + saved_top_size = self.top_size + self.top_layer = self.aux_top_layer + self.top_size = self.aux_top_size + yield + self.aux_top_layer = self.top_layer + self.aux_top_size = self.top_size + self.top_layer = saved_top_layer + self.top_size = saved_top_size + + def get_variable(self, name, shape, dtype, cast_dtype, *args, **kwargs): + # TODO(reedwm): Currently variables and gradients are transferred to other + # devices and machines as type `dtype`, not `cast_dtype`. In particular, + # this means in fp16 mode, variables are transferred as fp32 values, not + # fp16 values, which uses extra bandwidth. + var = tf.get_variable(name, shape, dtype, *args, **kwargs) + return tf.cast(var, cast_dtype) + + def _conv2d_impl(self, input_layer, num_channels_in, filters, kernel_size, + strides, padding, kernel_initializer): + if self.use_tf_layers: + return conv_layers.conv2d(input_layer, filters, kernel_size, strides, + padding, self.channel_pos, + kernel_initializer=kernel_initializer, + use_bias=False) + else: + weights_shape = [kernel_size[0], kernel_size[1], num_channels_in, filters] + # We use the name 'conv2d/kernel' so the variable has the same name as its + # tf.layers equivalent. This way, if a checkpoint is written when + # self.use_tf_layers == True, it can be loaded when + # self.use_tf_layers == False, and vice versa. 
+ weights = self.get_variable('conv2d/kernel', weights_shape, + self.variable_dtype, self.dtype, + initializer=kernel_initializer) + if self.data_format == 'NHWC': + strides = [1] + strides + [1] + else: + strides = [1, 1] + strides + return tf.nn.conv2d(input_layer, weights, strides, padding, + data_format=self.data_format) + + def conv(self, + num_out_channels, + k_height, + k_width, + d_height=1, + d_width=1, + mode='SAME', + input_layer=None, + num_channels_in=None, + use_batch_norm=None, + stddev=None, + activation='relu', + bias=0.0, + kernel_initializer=None): + """Construct a conv2d layer on top of cnn.""" + if input_layer is None: + input_layer = self.top_layer + if num_channels_in is None: + num_channels_in = self.top_size + if stddev is not None and kernel_initializer is None: + kernel_initializer = tf.truncated_normal_initializer(stddev=stddev) + if kernel_initializer is None: + kernel_initializer = tf.variance_scaling_initializer() + name = 'conv' + str(self.counts['conv']) + self.counts['conv'] += 1 + with tf.variable_scope(name): + strides = [1, d_height, d_width, 1] + if self.data_format == 'NCHW': + strides = [strides[0], strides[3], strides[1], strides[2]] + if mode != 'SAME_RESNET': + conv = self._conv2d_impl(input_layer, num_channels_in, num_out_channels, + kernel_size=[k_height, k_width], + strides=[d_height, d_width], padding=mode, + kernel_initializer=kernel_initializer) + else: # Special padding mode for ResNet models + if d_height == 1 and d_width == 1: + conv = self._conv2d_impl(input_layer, num_channels_in, + num_out_channels, + kernel_size=[k_height, k_width], + strides=[d_height, d_width], padding='SAME', + kernel_initializer=kernel_initializer) + else: + rate = 1 # Unused (for 'a trous' convolutions) + kernel_height_effective = k_height + (k_height - 1) * (rate - 1) + pad_h_beg = (kernel_height_effective - 1) // 2 + pad_h_end = kernel_height_effective - 1 - pad_h_beg + kernel_width_effective = k_width + (k_width - 1) * (rate - 1) + pad_w_beg = (kernel_width_effective - 1) // 2 + pad_w_end = kernel_width_effective - 1 - pad_w_beg + padding = [[0, 0], [pad_h_beg, pad_h_end], + [pad_w_beg, pad_w_end], [0, 0]] + if self.data_format == 'NCHW': + padding = [padding[0], padding[3], padding[1], padding[2]] + padded_input_layer = tf.pad(input_layer, padding) + conv = self._conv2d_impl(padded_input_layer, num_channels_in, + num_out_channels, + kernel_size=[k_height, k_width], + strides=[d_height, d_width], padding='VALID', + kernel_initializer=kernel_initializer) + if use_batch_norm is None: + use_batch_norm = self.use_batch_norm + mlperf.logger.log_conv2d(input_tensor=input_layer, output_tensor=conv, + stride_height=d_height, stride_width=d_width, + filters=num_out_channels, + initializer=kernel_initializer, + use_bias=not use_batch_norm and bias is not None) + if not use_batch_norm: + if bias is not None: + biases = self.get_variable('biases', [num_out_channels], + self.variable_dtype, self.dtype, + initializer=tf.constant_initializer(bias)) + biased = tf.reshape( + tf.nn.bias_add(conv, biases, data_format=self.data_format), + conv.get_shape()) + else: + biased = conv + else: + self.top_layer = conv + self.top_size = num_out_channels + biased = self.batch_norm(**self.batch_norm_config) + if activation == 'relu': + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + conv1 = tf.nn.relu(biased) + elif activation == 'linear' or activation is None: + conv1 = biased + elif activation == 'tanh': + conv1 = tf.nn.tanh(biased) + else: + raise KeyError('Invalid activation type 
\'%s\'' % activation) + self.top_layer = conv1 + self.top_size = num_out_channels + return conv1 + + def _pool(self, + pool_name, + pool_function, + k_height, + k_width, + d_height, + d_width, + mode, + input_layer, + num_channels_in): + """Construct a pooling layer.""" + if input_layer is None: + input_layer = self.top_layer + else: + self.top_size = num_channels_in + name = pool_name + str(self.counts[pool_name]) + self.counts[pool_name] += 1 + if self.use_tf_layers: + pool = pool_function( + input_layer, [k_height, k_width], [d_height, d_width], + padding=mode, + data_format=self.channel_pos, + name=name) + else: + if self.data_format == 'NHWC': + ksize = [1, k_height, k_width, 1] + strides = [1, d_height, d_width, 1] + else: + ksize = [1, 1, k_height, k_width] + strides = [1, 1, d_height, d_width] + pool = tf.nn.max_pool(input_layer, ksize, strides, padding=mode, + data_format=self.data_format, name=name) + if pool_name == 'mpool': + mlperf.logger.log_max_pool(input_tensor=input_layer, + output_tensor=pool) + self.top_layer = pool + return pool + + def mpool(self, + k_height, + k_width, + d_height=2, + d_width=2, + mode='VALID', + input_layer=None, + num_channels_in=None): + """Construct a max pooling layer.""" + return self._pool('mpool', pooling_layers.max_pooling2d, k_height, k_width, + d_height, d_width, mode, input_layer, num_channels_in) + + def apool(self, + k_height, + k_width, + d_height=2, + d_width=2, + mode='VALID', + input_layer=None, + num_channels_in=None): + """Construct an average pooling layer.""" + return self._pool('apool', pooling_layers.average_pooling2d, k_height, + k_width, d_height, d_width, mode, input_layer, + num_channels_in) + + def reshape(self, shape, input_layer=None): + if input_layer is None: + input_layer = self.top_layer + self.top_layer = tf.reshape(input_layer, shape) + self.top_size = shape[-1] # HACK This may not always work + return self.top_layer + + def affine(self, + num_out_channels, + input_layer=None, + num_channels_in=None, + bias=0.0, + stddev=None, + activation='relu'): + if input_layer is None: + input_layer = self.top_layer + if num_channels_in is None: + num_channels_in = self.top_size + name = 'affine' + str(self.counts['affine']) + self.counts['affine'] += 1 + with tf.variable_scope(name): + init_factor = 2. if activation == 'relu' else 1. 
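+      # A factor of 2 mirrors He initialization for ReLU layers
+      # (stddev = sqrt(2 / fan_in)); a factor of 1 gives the plain
+      # sqrt(1 / fan_in) scaling used for linear outputs, so the default
+      # stddev below is sqrt(init_factor / num_channels_in).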
+ stddev = stddev or np.sqrt(init_factor / num_channels_in) + kernel = self.get_variable( + 'weights', [num_channels_in, num_out_channels], + self.variable_dtype, self.dtype, + initializer=tf.truncated_normal_initializer(stddev=stddev)) + biases = self.get_variable('biases', [num_out_channels], + self.variable_dtype, self.dtype, + initializer=tf.constant_initializer(bias)) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_DENSE, + value=num_out_channels) + logits = tf.nn.xw_plus_b(input_layer, kernel, biases) + if activation == 'relu': + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + affine1 = tf.nn.relu(logits, name=name) + elif activation == 'linear' or activation is None: + affine1 = logits + else: + raise KeyError('Invalid activation type \'%s\'' % activation) + self.top_layer = affine1 + self.top_size = num_out_channels + return affine1 + + def inception_module(self, name, cols, input_layer=None, in_size=None): + if input_layer is None: + input_layer = self.top_layer + if in_size is None: + in_size = self.top_size + name += str(self.counts[name]) + self.counts[name] += 1 + with tf.variable_scope(name): + col_layers = [] + col_layer_sizes = [] + for c, col in enumerate(cols): + col_layers.append([]) + col_layer_sizes.append([]) + for l, layer in enumerate(col): + ltype, args = layer[0], layer[1:] + kwargs = { + 'input_layer': input_layer, + 'num_channels_in': in_size + } if l == 0 else {} + if ltype == 'conv': + self.conv(*args, **kwargs) + elif ltype == 'mpool': + self.mpool(*args, **kwargs) + elif ltype == 'apool': + self.apool(*args, **kwargs) + elif ltype == 'share': # Share matching layer from previous column + self.top_layer = col_layers[c - 1][l] + self.top_size = col_layer_sizes[c - 1][l] + else: + raise KeyError( + 'Invalid layer type for inception module: \'%s\'' % ltype) + col_layers[c].append(self.top_layer) + col_layer_sizes[c].append(self.top_size) + catdim = 3 if self.data_format == 'NHWC' else 1 + self.top_layer = tf.concat([layers[-1] for layers in col_layers], catdim) + self.top_size = sum([sizes[-1] for sizes in col_layer_sizes]) + return self.top_layer + + def spatial_mean(self, keep_dims=False): + name = 'spatial_mean' + str(self.counts['spatial_mean']) + self.counts['spatial_mean'] += 1 + axes = [1, 2] if self.data_format == 'NHWC' else [2, 3] + self.top_layer = tf.reduce_mean( + self.top_layer, axes, keepdims=keep_dims, name=name) + return self.top_layer + + def dropout(self, keep_prob=0.5, input_layer=None): + if input_layer is None: + input_layer = self.top_layer + else: + self.top_size = None + name = 'dropout' + str(self.counts['dropout']) + with tf.variable_scope(name): + if not self.phase_train: + keep_prob = 1.0 + if self.use_tf_layers: + dropout = core_layers.dropout(input_layer, 1. - keep_prob, + training=self.phase_train) + else: + dropout = tf.nn.dropout(input_layer, keep_prob) + self.top_layer = dropout + return dropout + + def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon): + """Batch normalization on `input_layer` without tf.layers.""" + # We make this function as similar as possible to the + # tf.contrib.layers.batch_norm, to minimize the differences between using + # layers and not using layers. 
+ shape = input_layer.shape + num_channels = shape[3] if self.data_format == 'NHWC' else shape[1] + beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32, + initializer=tf.zeros_initializer()) + if use_scale: + gamma = self.get_variable('gamma', [num_channels], tf.float32, + tf.float32, initializer=tf.ones_initializer()) + else: + gamma = tf.constant(1.0, tf.float32, [num_channels]) + # For moving variables, we use tf.get_variable instead of self.get_variable, + # since self.get_variable returns the result of tf.cast which we cannot + # assign to. + moving_mean = tf.get_variable('moving_mean', [num_channels], + tf.float32, + initializer=tf.zeros_initializer(), + trainable=False) + moving_variance = tf.get_variable('moving_variance', [num_channels], + tf.float32, + initializer=tf.ones_initializer(), + trainable=False) + if self.phase_train: + bn, batch_mean, batch_variance = tf.nn.fused_batch_norm( + input_layer, gamma, beta, epsilon=epsilon, + data_format=self.data_format, is_training=True) + mean_update = moving_averages.assign_moving_average( + moving_mean, batch_mean, decay=decay, zero_debias=False) + variance_update = moving_averages.assign_moving_average( + moving_variance, batch_variance, decay=decay, zero_debias=False) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update) + else: + bn, _, _ = tf.nn.fused_batch_norm( + input_layer, gamma, beta, mean=moving_mean, + variance=moving_variance, epsilon=epsilon, + data_format=self.data_format, is_training=False) + return bn + + def batch_norm(self, input_layer=None, decay=0.999, scale=False, + epsilon=0.001): + """Adds a Batch Normalization layer.""" + if input_layer is None: + input_layer = self.top_layer + else: + self.top_size = None + name = 'batchnorm' + str(self.counts['batchnorm']) + self.counts['batchnorm'] += 1 + + center = True + with tf.variable_scope(name) as scope: + if self.use_tf_layers: + layer_obj = normalization_layers.BatchNormalization( + momentum=decay, + scale=scale, + epsilon=epsilon, + fused=True, + axis=_data_format_to_channel_axis[self.data_format], + # We pass this 'scope' argument for compatibility with checkpoints + # created with the contrib version of batch norm. tf_cnn_benchmarks + # used to use the contrib version. + _scope=scope, + center=center, + name=scope.name) + bn = layer_obj.apply(input_layer, training=self.phase_train) + else: + bn = self._batch_norm_without_layers(input_layer, decay, scale, epsilon) + self.top_layer = bn + self.top_size = bn.shape[3] if self.data_format == 'NHWC' else bn.shape[1] + self.top_size = int(self.top_size) + mlperf.logger.log_batch_norm( + input_tensor=input_layer, output_tensor=bn, momentum=decay, + epsilon=epsilon, center=center, scale=scale, training=self.phase_train) + return bn + + def lrn(self, depth_radius, bias, alpha, beta): + """Adds a local response normalization layer.""" + name = 'lrn' + str(self.counts['lrn']) + self.counts['lrn'] += 1 + self.top_layer = tf.nn.lrn( + self.top_layer, depth_radius, bias, alpha, beta, name=name) + return self.top_layer diff --git a/cv/classification/resnet50/tensorflow/datasets.py b/cv/classification/resnet50/tensorflow/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..e51743e3d37231256288636ead999b8d23eb3dfe --- /dev/null +++ b/cv/classification/resnet50/tensorflow/datasets.py @@ -0,0 +1,272 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmark dataset utilities. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from abc import abstractmethod +import os + +import numpy as np +import six +from six.moves import cPickle +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +from tensorflow.python.platform import gfile +import preprocessing + +IMAGENET_NUM_TRAIN_IMAGES = 1281167 +IMAGENET_NUM_VAL_IMAGES = 50000 + +IMAGENETTE_NUM_TRAIN_IMAGES = 9469 +IMAGENETTE_NUM_VAL_IMAGES = 3925 + +COCO_NUM_TRAIN_IMAGES = 118287 +COCO_NUM_VAL_IMAGES = 4952 + + +class Dataset(object): + """Abstract class for cnn benchmarks dataset.""" + + def __init__(self, + name, + data_dir=None, + queue_runner_required=False, + num_classes=None): + self.name = name + self.data_dir = data_dir + self._queue_runner_required = queue_runner_required + self._num_classes = num_classes + + def tf_record_pattern(self, subset): + return os.path.join(self.data_dir, '%s-*-of-*' % subset) + + def reader(self): + return tf.TFRecordReader() + + @property + def num_classes(self): + return self._num_classes + + @num_classes.setter + def num_classes(self, val): + self._num_classes = val + + @abstractmethod + def num_examples_per_epoch(self, subset): + pass + + def __str__(self): + return self.name + + def get_input_preprocessor(self, input_preprocessor='default'): + assert not self.use_synthetic_gpu_inputs() + return _SUPPORTED_INPUT_PREPROCESSORS[self.name][input_preprocessor] + + def queue_runner_required(self): + return self._queue_runner_required + + def use_synthetic_gpu_inputs(self): + return not self.data_dir + + +class LibrispeechDataset(Dataset): + """Configuration for LibriSpeech dataset.""" + + def __init__(self, data_dir=None): + super(LibrispeechDataset, self).__init__( + 'librispeech', data_dir, num_classes=29) + + def tf_record_pattern(self, subset): + if subset == 'train': + return os.path.join(self.data_dir, 'train-clean-*.tfrecords') + elif subset == 'validation': + return os.path.join(self.data_dir, 'test-clean.tfrecords') + else: + return '' + + def num_examples_per_epoch(self, subset='train'): + del subset + return 2 # TODO(laigd): currently this is an arbitrary number. 
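+
+
+# Illustrative usage (not executed here; the data path is a placeholder).
+# A Dataset is usually built via create_dataset() at the bottom of this file,
+# or by instantiating a concrete subclass directly, and is then queried for
+# epoch sizes and preprocessors, e.g.:
+#
+#   dataset = create_dataset('/path/to/imagenette_tfrecord', 'imagenette')
+#   dataset.num_examples_per_epoch('train')      # 9469 for Imagenette
+#   dataset.tf_record_pattern('train')           # '<data_dir>/train-*-of-*'
+#   preprocessor_cls = dataset.get_input_preprocessor('default')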
+ + +class ImageDataset(Dataset): + """Abstract class for image datasets.""" + + def __init__(self, + name, + height, + width, + depth=None, + data_dir=None, + queue_runner_required=False, + num_classes=1001): + super(ImageDataset, self).__init__(name, data_dir, queue_runner_required, + num_classes) + self.height = height + self.width = width + self.depth = depth or 3 + + +class ImagenetDataset(ImageDataset): + """Configuration for Imagenet dataset.""" + + def __init__(self, data_dir=None): + super(ImagenetDataset, self).__init__( + 'imagenet', 300, 300, data_dir=data_dir) + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENET_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENET_NUM_VAL_IMAGES + else: + raise ValueError('Invalid data subset "%s"' % subset) + +class ImagenetteDataset(ImageDataset): + """Configuration for Imagenette dataset.""" + def __init__(self, data_dir=None): + super(ImagenetteDataset, self).__init__( + 'imagenette', 300, 300, data_dir=data_dir, num_classes=10) + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENETTE_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENETTE_NUM_VAL_IMAGES + else: + raise ValueError('Invalid data subset "%s"' % subset) + +class Cifar10Dataset(ImageDataset): + """Configuration for cifar 10 dataset. + + It will mount all the input images to memory. + """ + + def __init__(self, data_dir=None): + super(Cifar10Dataset, self).__init__( + 'cifar10', + 32, + 32, + data_dir=data_dir, + queue_runner_required=True, + num_classes=11) + + def read_data_files(self, subset='train'): + """Reads from data file and returns images and labels in a numpy array.""" + assert self.data_dir, ('Cannot call `read_data_files` when using synthetic ' + 'data') + if subset == 'train': + filenames = [ + os.path.join(self.data_dir, 'data_batch_%d' % i) + for i in xrange(1, 6) + ] + elif subset == 'validation': + filenames = [os.path.join(self.data_dir, 'test_batch')] + else: + raise ValueError('Invalid data subset "%s"' % subset) + + inputs = [] + for filename in filenames: + with gfile.Open(filename, 'rb') as f: + # python2 does not have the encoding parameter + encoding = {} if six.PY2 else {'encoding': 'bytes'} + inputs.append(cPickle.load(f, **encoding)) + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. 
+ all_images = np.concatenate( + [each_input[b'data'] for each_input in inputs]).astype(np.float32) + all_labels = np.concatenate( + [each_input[b'labels'] for each_input in inputs]) + return all_images, all_labels + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return 50000 + elif subset == 'validation': + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + +class COCODataset(ImageDataset): + """COnfiguration for COCO dataset.""" + + def __init__(self, data_dir=None, image_size=300): + super(COCODataset, self).__init__( + 'coco', image_size, image_size, data_dir=data_dir, num_classes=81) + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return COCO_NUM_TRAIN_IMAGES + elif subset == 'validation': + return COCO_NUM_VAL_IMAGES + else: + raise ValueError('Invalid data subset "%s"' % subset) + + +_SUPPORTED_DATASETS = { + 'imagenet': ImagenetDataset, + 'imagenette' : ImagenetteDataset, + 'cifar10': Cifar10Dataset, + 'librispeech': LibrispeechDataset, + 'coco': COCODataset, +} + +_SUPPORTED_INPUT_PREPROCESSORS = { + 'imagenet': { + 'default': preprocessing.RecordInputImagePreprocessor, + 'official_models_imagenet': preprocessing.ImagenetPreprocessor, + }, + 'imagenette': { + 'default': preprocessing.RecordInputImagePreprocessor, + 'official_models_imagenet': preprocessing.ImagenetPreprocessor, + }, + 'cifar10': { + 'default': preprocessing.Cifar10ImagePreprocessor + }, + 'librispeech': { + 'default': preprocessing.LibrispeechPreprocessor + }, + 'coco': { + 'default': preprocessing.COCOPreprocessor + }, +} + + +def create_dataset(data_dir, data_name): + """Create a Dataset instance based on data_dir and data_name.""" + if not data_dir and not data_name: + # When using synthetic data, use synthetic imagenet images by default. + data_name = 'imagenet' + + # Infere dataset name from data_dir if data_name is not provided. + if data_name is None: + for supported_name in _SUPPORTED_DATASETS: + if supported_name in data_dir: + data_name = supported_name + break + else: # Failed to identify dataset name from data dir. + raise ValueError('Could not identify name of dataset. ' + 'Please specify with --data_name option.') + if data_name not in _SUPPORTED_DATASETS: + raise ValueError('Unknown dataset. Must be one of %s' % ', '.join( + [key for key in sorted(_SUPPORTED_DATASETS.keys())])) + + return _SUPPORTED_DATASETS[data_name](data_dir) diff --git a/cv/classification/resnet50/tensorflow/download_script.sh b/cv/classification/resnet50/tensorflow/download_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..b51a687020423920f9a7d0cc1b8fc1ddf964b4bd --- /dev/null +++ b/cv/classification/resnet50/tensorflow/download_script.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
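+
+# NOTE: this script is currently a placeholder that exits immediately; see
+# get_imagenette.sh and the README in this directory for how the dataset is
+# actually prepared.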
+ +set -e +exit 0 \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/flags.py b/cv/classification/resnet50/tensorflow/flags.py new file mode 100644 index 0000000000000000000000000000000000000000..f65898ae2e68c3d0891dd605b877b78cf108e6c0 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/flags.py @@ -0,0 +1,93 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains functions to define flags and params. + +Calling a DEFINE_* function will add a ParamSpec namedtuple to the param_spec +dict. The DEFINE_* arguments match those in absl. Calling define_flags() creates +a command-line flag for every ParamSpec defined by a DEFINE_* functions. + +The reason we don't use absl flags directly is that we want to be able to use +tf_cnn_benchmarks as a library. When using it as a library, we don't want to +define any flags, but instead pass parameters to the BenchmarkCNN constructor. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple + +from absl import flags as absl_flags +import six + + +FLAGS = absl_flags.FLAGS + + +# ParamSpec describes one of benchmark_cnn.BenchmarkCNN's parameters. +ParamSpec = namedtuple('_ParamSpec', + ['flag_type', 'default_value', 'description', + 'kwargs']) + + +# Maps from parameter name to its ParamSpec. 
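+# A DEFINE_* call below only records an entry in this dict. For example
+# (hypothetical values), flags.DEFINE_integer('batch_size', 32,
+# 'Batch size per device.') adds a ParamSpec named 'batch_size', and the
+# corresponding --batch_size command-line flag is only created once
+# define_flags() is called.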
+param_specs = {} + + +def DEFINE_string(name, default, help): # pylint: disable=invalid-name,redefined-builtin + param_specs[name] = ParamSpec('string', default, help, {}) + + +def DEFINE_boolean(name, default, help): # pylint: disable=invalid-name,redefined-builtin + param_specs[name] = ParamSpec('boolean', default, help, {}) + + +def DEFINE_integer(name, default, help, lower_bound=None, upper_bound=None): # pylint: disable=invalid-name,redefined-builtin + kwargs = {'lower_bound': lower_bound, 'upper_bound': upper_bound} + param_specs[name] = ParamSpec('integer', default, help, kwargs) + + +def DEFINE_float(name, default, help, lower_bound=None, upper_bound=None): # pylint: disable=invalid-name,redefined-builtin + kwargs = {'lower_bound': lower_bound, 'upper_bound': upper_bound} + param_specs[name] = ParamSpec('float', default, help, kwargs) + + +def DEFINE_enum(name, default, enum_values, help): # pylint: disable=invalid-name,redefined-builtin + kwargs = {'enum_values': enum_values} + param_specs[name] = ParamSpec('enum', default, help, kwargs) + + +def DEFINE_list(name, default, help): # pylint: disable=invalid-name,redefined-builtin + param_specs[name] = ParamSpec('list', default, help, {}) + + +def define_flags(specs=None): + """Define a command line flag for each ParamSpec in flags.param_specs.""" + specs = specs or param_specs + define_flag = { + 'boolean': absl_flags.DEFINE_boolean, + 'float': absl_flags.DEFINE_float, + 'integer': absl_flags.DEFINE_integer, + 'string': absl_flags.DEFINE_string, + 'enum': absl_flags.DEFINE_enum, + 'list': absl_flags.DEFINE_list + } + for name, param_spec in six.iteritems(specs): + if param_spec.flag_type not in define_flag: + raise ValueError('Unknown flag_type %s' % param_spec.flag_type) + else: + define_flag[param_spec.flag_type](name, param_spec.default_value, + help=param_spec.description, + **param_spec.kwargs) diff --git a/cv/classification/resnet50/tensorflow/get_imagenette.sh b/cv/classification/resnet50/tensorflow/get_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..460e635f9ffdcc3a58c5980c7f299440691fd941 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/get_imagenette.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +: ${DATA_DIR:="./"} + + +if [ ! -d "./imagenette" ]; then + echo "Make soft link form ${DATA_DIR} to tf_cnn_benckmarks" + ln -s "${DATA_DIR}/imagenette_tfrecord" imagenette +fi + diff --git a/cv/classification/resnet50/tensorflow/get_num_devices.sh b/cv/classification/resnet50/tensorflow/get_num_devices.sh new file mode 100644 index 0000000000000000000000000000000000000000..1637c5a1f95e53979cc9e76a0f2ec7eab9fea564 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/get_num_devices.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +devices=$CUDA_VISIBLE_DEVICES +if [ -n "$devices" ]; then + _devices=(${devices//,/ }) + num_devices=${#_devices[@]} +else + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 + echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" +fi +export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/leading_indicators_test.py b/cv/classification/resnet50/tensorflow/leading_indicators_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1bd8715261afc5e19ca4484fe95c81f6c2330d26 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/leading_indicators_test.py @@ -0,0 +1,1003 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmark various leading indicators CNNs. + +The purpose of these tests is to test each model as a high level baseline and +to ensure the various variable_update options have not regressing. Not all +options are tested. The tests focus on the most viable options. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ctypes +import logging +import os +import sys + +from absl import flags +from absl.testing import absltest # pylint: disable=unused-import +import tensorflow.compat.v1 as tf # pylint: disable=g-bad-import-order +import benchmark_cnn +from platforms import util as platforms_util + +flags.DEFINE_integer('num_batches', None, + 'number of batches to run, excluding warmup') + + +class BenchmarkBase(tf.test.Benchmark): + """Base class for all benchmarks in this file.""" + + def __init__(self, output_dir=None, root_data_dir=None, **kwargs): + """Base class for all benchmarks in this file. + + Args: + output_dir: directory where to output e.g. log files + root_data_dir: directory under which to look for dataset + **kwargs: arbitrary named arguments. This is needed to make the + constructor forward compatible in case PerfZero provides more + named arguments before updating the constructor. 
+ """ + + # Load default values if the benchmark is not run with absl.app.run() + if not flags.FLAGS.is_parsed(): + flags.FLAGS.mark_as_parsed() + + self.fake_data_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + self.output_dir = output_dir + if root_data_dir is None: + self.data_dir = ('/readahead/200M/placer/prod/home/distbelief/' + 'imagenet-tensorflow/imagenet-2012-tfrecord') + else: + self.data_dir = os.path.join(root_data_dir, 'imagenet') + + def _run_benchmark(self, params): + """Run a CNN benchmark and report its results. + + Args: + params: Params tuple, typically created by benchmark_cnn.make_params or + benchmark_cnn.make_params_from_flags. + """ + logging.info('Running benchmark [%s]', self._get_name()) + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + stats = bench.run() + extras = {} + extras['examples_per_sec'] = stats.get('images_per_sec') + if 'last_average_loss' in stats: + extras['last_average_loss'] = stats['last_average_loss'] + if 'top_1_accuracy' in stats: + extras['top_1_accuracy'] = stats['top_1_accuracy'] + if 'top_5_accuracy' in stats: + extras['top_5_accuracy'] = stats['top_5_accuracy'] + self.report_benchmark( + iters=stats.get('num_steps'), + wall_time=stats.get('average_wall_time'), + extras=extras) + + def _shared_params(self): + """Returns shared parameters for all benchmarks in this file.""" + params = {} + if flags.FLAGS.num_batches is not None: + params['num_batches'] = flags.FLAGS.num_batches + if self.output_dir is not None: + params['benchmark_log_dir'] = self.output_dir + return benchmark_cnn.make_params(**params) + + def _binary_search_batch_size(self, params, init_batch_size): + """Find the max batch_size using binary search.""" + assert init_batch_size > 0 + low_batch_size = 0 + high_batch_size = None + batch_size = init_batch_size + + # No need to run a warmup or many batches; if it doesn't OOM after 10 + # batches, it should work in general. + params = params._replace(num_batches=10, num_warmup_batches=0) + + # Find high_batch_size first. + tf.logging.info( + 'Looking for upper bound to batch size, starting with %d' % batch_size) + while high_batch_size is None: + tf.logging.info('Trying batch_size %d' % batch_size) + params = params._replace(batch_size=batch_size) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + try: + bench.run() + low_batch_size = batch_size + batch_size *= 2 + except tf.errors.ResourceExhaustedError: + high_batch_size = batch_size - 1 + + # Binary Search + tf.logging.info( + 'Max batch size is in range (%d, %d]. Starting binary search to find ' + 'exact max batch size.' 
% (low_batch_size, batch_size)) + while low_batch_size < high_batch_size: + batch_size = (low_batch_size + high_batch_size + 1) // 2 + tf.logging.info('Trying batch_size %d' % batch_size) + params = params._replace(batch_size=batch_size) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + try: + bench.run() + low_batch_size = batch_size + except tf.errors.ResourceExhaustedError: + high_batch_size = batch_size - 1 + self.report_benchmark(extras={'max_batch_size': low_batch_size}) + + +class Resnet50BenchmarksInferenceCpu(BenchmarkBase): + """"Benchmarks for ResNet50 inference on CPU.""" + + def _shared_params(self): + """Returns shared parameters for all ResNet50 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + num_gpus=1, + model='resnet50', + num_warmup_batches=5, + num_batches=50, + distortions=False, + forward_only=True, + device='cpu', + data_format='NHWC', + num_intra_threads=0) + + def benchmark_synth_forward_batch1(self): + """Tests 1 CPU batch size 1.""" + params = self._shared_params()._replace(batch_size=1) + self._run_benchmark(params) + + def benchmark_synth_forward_batch16(self): + """Tests 1 CPU batch size 16.""" + params = self._shared_params()._replace(batch_size=16) + self._run_benchmark(params) + + +class FrozenResnet50BenchmarksInferenceCpu(Resnet50BenchmarksInferenceCpu): + """"Benchmarks for ResNet50 frozen graph inference on CPU.""" + + def _shared_params(self): + return super(FrozenResnet50BenchmarksInferenceCpu, + self)._shared_params()._replace(freeze_when_forward_only=True) + + +class Resnet50BenchmarksInference(BenchmarkBase): + """"Benchmarks for ResNet50 inference.""" + + def _shared_params(self): + """Returns shared parameters for all ResNet50 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + num_gpus=1, model='resnet50', distortions=False, forward_only=True) + + def benchmark_synth_forward_batch128(self): + """Tests 1 GPU batch size 128.""" + params = self._shared_params()._replace(batch_size=128) + self._run_benchmark(params) + + def benchmark_fp16_synth_forward_batch128(self): + """Tests 1 GPU batch size 128 FP16.""" + params = self._shared_params()._replace(batch_size=128, use_fp16=True) + self._run_benchmark(params) + + def benchmark_fp16_synth_forward_batch16(self): + """Tests 1 GPU batch size 16 FP16.""" + params = self._shared_params()._replace(batch_size=16, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_synth_forward_batch128(self): + """Tests 1 GPU batch size 128 with XLA.""" + params = self._shared_params()._replace(batch_size=128, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_forward_batch128(self): + """Tests 1 GPU batch size 128 FP16 with XLA.""" + params = self._shared_params()._replace( + batch_size=128, use_fp16=True, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_forward_batch16(self): + """Tests 1 GPU batch size 16 FP16 with XLA.""" + params = self._shared_params()._replace( + batch_size=16, use_fp16=True, xla=True) + self._run_benchmark(params) + + +class FrozenResnet50BenchmarksInference(Resnet50BenchmarksInference): + """"Benchmarks for ResNet50 frozen graph inference.""" + + def _shared_params(self): + return super(FrozenResnet50BenchmarksInference, + self)._shared_params()._replace(freeze_when_forward_only=True) + + def benchmark_trt_synth_forward_batch128(self): + """Tests 1 GPU batch size 128.""" + params = self._shared_params()._replace(batch_size=128, trt_mode='FP32') + self._run_benchmark(params) + + 
# TODO(laigd): enable fp16 tests for TF-TRT, it's currently not supported yet. + # def benchmark_fp16_trt_synth_forward_batch128(self): + # """Tests 1 GPU batch size 128 FP16.""" + # params = self._shared_params()._replace( + # batch_size=128, use_fp16=True, trt_mode='FP16') + # self._run_benchmark(params) + + # Test with batch size 16 to compare with native TF GPU implementation and + # XLA. + # def benchmark_fp16_trt_synth_forward_batch16(self): + # """Tests 1 GPU batch size 16 FP16.""" + # params = self._shared_params()._replace( + # batch_size=16, use_fp16=True, trt_mode='FP16') + # self._run_benchmark(params) + + +class Resnet50Benchmarks(BenchmarkBase): + """"Benchmark resnet50 configurations.""" + + def _shared_params(self): + """Returns shared parameters for all ResNet50 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='resnet50', batch_size=128, distortions=False, + optimizer='momentum') + + def _shared_params_fp16(self): + """Returns shared parameters for all ResNet50 FP16 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='resnet50', + batch_size=256, + distortions=False, + use_fp16=True, + optimizer='momentum', + loss_type_to_report='base_loss', + compute_lr_on_cpu=True, + single_l2_loss_op=True + ) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params()._replace(num_gpus=1) + self._run_benchmark(params) + + def benchmark_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data.""" + params = self._shared_params()._replace( + num_gpus=1, data_dir=self.fake_data_dir, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_synth_1gpu_max_batch_size(self): + """Finds largest batch size that can be run with 1 gpu using synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._binary_search_batch_size(params, init_batch_size=128) + + def benchmark_synth_4gpu_gpureplicated(self): + """Tests 4 gpu with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=4, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fake_8gpu_gpureplicated(self): + """Tests 8 gpu with fake data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + # FP16 mixed-precision tests. 
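+  # These correspond roughly to a command line such as (illustrative only):
+  #   python tf_cnn_benchmarks.py --model=resnet50 --batch_size=256 --use_fp16 \
+  #     --optimizer=momentum --variable_update=replicated \
+  #     --all_reduce_spec=nccl --gradient_repacking=2 --num_gpus=8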
+ + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on the gpu.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams_batch128(self): + """Tests 1 gpu with synthetic data with parameters on the gpu.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, batch_size=128, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_4gpu_gpureplicated(self): + """Tests 4 gpu with synthetic data with nccl and all_reduce.""" + params = self._shared_params_fp16()._replace( + num_gpus=4, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic with nccl and all_reduce.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_fake_1gpu_gpuparams(self): + """Tests 1 gpus with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_fake_8gpu_gpureplicated(self): + """Tests 8 gpus with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_fakedistort_8gpu_gpureplicated(self): + """Tests 8 gpus with fake distorted data.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + distortions=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + # XLA versions of Resnet50 tests only for single GPU. + def benchmark_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data with XLA.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing on guitar. 
+ def benchmark_ng_xla_batch64_synth_1gpu_gpuparams(self): + """Tests 1 gpu with XLA, synth data, and batch 64.""" + params = self._shared_params()._replace( + num_gpus=1, batch_size=64, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch64_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, XLA, synth data, and batch 64.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + batch_size=64, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, XLA, and synth data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + batch_size=128, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_max_batch_size(self): + """Finds largest batch that can be run with XLA, 1 gpu, and synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._binary_search_batch_size(params, init_batch_size=128) + + def benchmark_xla_real_1gpu_gpuparams(self): + """Tests 1 gpu with real data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fakedistort_1gpu_gpuparams(self): + """Tests 1 gpu with fake distorted data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + distortions=True, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class Resnet50v15Benchmarks(BenchmarkBase): + """"Benchmark various ResNet50V1.5 configurations. + + ResNetV1.5 differs from V1 in stride 2 is used in the first 3x3 convolution of + each block instead of the first 1x1 convolution. 
+ """ + + def _shared_params_fp16(self): + """Returns shared parameters for all ResNet50v1.5 FP16 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='resnet50_v1.5', + batch_size=256, + distortions=False, + use_fp16=True, + optimizer='momentum', + loss_type_to_report='base_loss', + compute_lr_on_cpu=True, + single_l2_loss_op=True + ) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params_fp16()._replace(num_gpus=1) + self._run_benchmark(params) + + def benchmark_fp16_batch256_synth_8gpu_gpuparams(self): + """Tests 8 gpus with synthetic data at batch 256.""" + params = self._shared_params_fp16()._replace(num_gpus=8) + self._run_benchmark(params) + + def benchmark_fp16_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data at batch 128 (useful for small GPUs).""" + params = self._shared_params_fp16()._replace(num_gpus=1, batch_size=128) + self._run_benchmark(params) + + def benchmark_fp16_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, data_dir=self.fake_data_dir, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fp16_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_fake_8gpu_gpureplicated(self): + """Tests 8 gpu with fake data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + # XLA versions of Resnet50v1.5 tests. 
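+  # Note: in these params xla=True requests XLA auto-jit for the session,
+  # while xla_compile=True (used in several benchmarks below) takes a
+  # separate, explicit-compilation path; the two are benchmarked separately
+  # (see benchmark_cnn.py for how each flag is handled).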
+ def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data with XLA.""" + params = self._shared_params_fp16()._replace(num_gpus=1, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, batch128, synthetic data with XLA.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, batch_size=128, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params_fp16()._replace(num_gpus=1, xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data at batch 128 (useful for small GPUs).""" + params = self._shared_params_fp16()._replace( + num_gpus=1, num_batches=200, batch_size=128, xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch256_synth_8gpu_gpuparams(self): + """Tests 8 gpu with synthetic data and xla autojit.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, num_batches=200, batch_size=256, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_fake_8gpu_gpureplicated(self): + """Tests 8 gpu with fake data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + xla_compile=True) + self._run_benchmark(params) + + +class Vgg16Benchmarks(BenchmarkBase): + """"Benchmark various vgg16 configurations.""" + + def _shared_params(self): + """Returns shared parameters for all vgg16 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='vgg16', batch_size=128, distortions=False) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, use_fp16=True, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + all_reduce_spec='nccl', + 
variable_update='replicated', + compact_gradient_transfer=False, + gradient_repacking=2) + self._run_benchmark(params) + + # XLA versions of VGG16 tests only for single GPU. + def benchmark_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data, and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_real_1gpu_gpuparams(self): + """Tests 1 gpu with real data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class TrivialBenchmarks(BenchmarkBase): + """"Benchmarks for trivial model. + + The purpose of these tests is to verify the upper bound for the input + pipeline. Fake data creates an upperbound on the input pipeline throughput. + """ + + def _shared_params(self): + """Returns shared parameters for all trivial benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='trivial', + num_gpus=8, + distortions=False, + variable_update='independent', + data_dir=self.fake_data_dir) + + def benchmark_fake_64batch(self): + params = self._shared_params()._replace(batch_size=64, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fake_128batch(self): + params = self._shared_params()._replace( + batch_size=128, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fake_256batch(self): + params = self._shared_params()._replace( + batch_size=256, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fakedistort_128batch(self): + params = self._shared_params()._replace( + batch_size=128, data_name='imagenet', distortions=True) + self._run_benchmark(params) + + +class AlexnetBenchmarks(BenchmarkBase): + """"Benchmarks for alexnet.""" + + def _shared_params(self): + """Returns shared parameters for all alexnet benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='alexnet', batch_size=512, distortions=False) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, use_fp16=True, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_synth_8gpu_gpureplicated(self): + """Tests 8 gpus with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + compact_gradient_transfer=False, + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fake_8gpu_gpureplicated(self): + """Tests 8 gpus with fake 
data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + compact_gradient_transfer=False, + gradient_repacking=2) + self._run_benchmark(params) + + # XLA Benchmark tests for AlexNet. + def benchmark_xla_synth_1gpuparams(self): + """Tests 1 gpu with synthetic data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fake_1gpuparams(self): + """Tests 1 gpu with fake data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_real_1gpuparams(self): + """Tests 1 gpu with real data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class InceptionV3Benchmarks(BenchmarkBase): + """"Benchmark for InceptionV3.""" + + def _shared_params(self): + """Returns shared parameters for all InceptionV3 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='inception3', batch_size=64, distortions=False) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params()._replace( + num_gpus=1, use_fp16=True, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_synth_1gpu_max_batch_size(self): + """Finds largest batch size that can be run with 1 gpu using synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._binary_search_batch_size(params, init_batch_size=128) + + def benchmark_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, XLA and synthetic data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_max_batch_size(self): + """Finds largest batch that can be run with XLA, 1 gpu, and synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._binary_search_batch_size(params, init_batch_size=128) + + # Test does not run as part of continuous testing. 
+ def benchmark_xla_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_real_1gpu_gpuparams(self): + """Tests 1 gpu with real data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class NcfBenchmarks(BenchmarkBase): + """Benchmarks for neural collaborative filtering.""" + + def _shared_params(self): + return BenchmarkBase._shared_params(self)._replace( + model='ncf', batch_size=64*1024, num_gpus=1, num_warmup_batches=1) + + def benchmark_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace(variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_compile_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla_compile=True, use_fp16=True) + self._run_benchmark(params) + + +class DeepSpeech2Benchmarks(BenchmarkBase): + """Benchmarks for DeepSpeech2 model.""" + + def _shared_params(self): + return BenchmarkBase._shared_params(self)._replace( + model='deepspeech2', batch_size=32, num_gpus=1, data_name='librispeech') + + def benchmark_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace(variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_xla_compile_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla_compile=True) + self._run_benchmark(params) + + +class SsdBenchmarks(BenchmarkBase): + """Benchmarks for SSD model.""" + + def _cudnn_version(self): + if sys.platform == 'win32': + return None + + lib = ctypes.cdll.LoadLibrary(None) + if hasattr(lib, 'cudnnGetErrorString'): + version = lib.cudnnGetVersion() + return version + + return None + + def _shared_params(self): + cudnn_version = self._cudnn_version() + if cudnn_version is None or cudnn_version < 7300: + raise RuntimeError( + 'Needs at least cuDNN 7.3 to work with fp16 (b/112048183). ' + 'Build with --define=use_experimental_cudnn=1') + + return BenchmarkBase._shared_params(self)._replace( + # TODO(b/115672206): Replace backbone model and data dir with replicated + # placer location for better performance. 
+ backbone_model_path=platforms_util.get_ssd_backborn_model_file(), # pylint: disable=line-too-long + data_dir=platforms_util.get_ssd_backboard_data_dir(), + batch_size=128, + data_name='coco', + model='ssd300', + num_batches=10, + num_warmup_batches=1, + num_gpus=1, + optimizer='momentum', + momentum=0.9, + weight_decay=5e-4, + loss_type_to_report='base_loss', + single_l2_loss_op=True, + compute_lr_on_cpu=True, + ) + + def benchmark_xla_compile_real_1gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=1, + xla_compile=True, + ) + self._run_benchmark(params) + + def benchmark_real_1gpu_gpuparams(self): + params = self._shared_params()._replace(num_gpus=1,) + self._run_benchmark(params) + + def benchmark_xla_compile_fp16_real_1gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=1, xla_compile=True, use_fp16=True) + self._run_benchmark(params) + + def benchmark_fp16_real_1gpu_gpuparams(self): + params = self._shared_params()._replace(num_gpus=1, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_compile_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + xla_compile=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + def benchmark_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + def benchmark_xla_compile_fp16_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + xla_compile=True, + use_fp16=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + def benchmark_fp16_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + use_fp16=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/mlperf.py b/cv/classification/resnet50/tensorflow/mlperf.py new file mode 100644 index 0000000000000000000000000000000000000000..932f3136e1b5d4abb5afefebaf3c9512a7b0ca15 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/mlperf.py @@ -0,0 +1,260 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains functions related to MLPerf compliance. + +MLPerf requires submissions to log what the benchmark does, in order to verify +that the benchmark meets the MLPerf requirements. This module contains a global +object `logger` that is used by other files to log what tf_cnn_benchmarks does +for compliance. 
+ +By default, `logger` does nothing, as the MLPerf compliance logs are verbose and +unnecessary if one is not concerned about MLPerf compliance. The logger can be +enabled by using the `mlperf_logger` context manager. + +To enable the logger with `mlperf_logger`, the MLPerf compliance library at +https://github.com/mlperf/training/tree/master/compliance is required. If +the logger is not enabled, the library is not needed. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from collections import namedtuple +import contextlib +import os +import sys + +import tensorflow.compat.v1 as tf + +# pylint: disable=g-import-not-at-top +try: + # Not all users have the MLPerf compliance library, so we don't want to + # unconditionally crash if these imports fail. + from mlperf_compliance import mlperf_log + from mlperf_compliance import resnet_log_helper + from mlperf_compliance import tags + import_successful = True +except ImportError: + # The logger cannot be enabled in this case since the MLPerf library isn't + # found. We return empty strings from the `tags` attribute so that + # the benchmark can still run without crashing. This empty tags are passed + # to an instance of `NullMlPerfLogger`, which does not log anything and + # ignores the tag values. + + class _Tags(object): + + def __getattr__(self, item): + return '' + tags = _Tags() + import_successful = False +# pylint: enable=g-import-not-at-top + + +_ModelInfo = namedtuple('_ModelInfo', ['print_fn', 'tag_set', + 'mlperf_model_name']) + + +_MLPERF_LOG_PREFIX = ':::MLPv0.5.0' + + +class MlPerfLogger(object): + """Logs various aspects about a benchmark run for MLPerf compliance.""" + + def __init__(self, model): + self._root_dir = os.path.split(os.path.abspath(__file__))[0] + mlperf_log.ROOT_DIR_RESNET = self._root_dir + mlperf_log.ROOT_DIR_SSD = self._root_dir + self.model = model + model_to_info = { + 'resnet50_v1.5': _ModelInfo(mlperf_log.resnet_print, + mlperf_log.RESNET_TAG_SET, tags.RESNET), + 'ssd300': _ModelInfo(mlperf_log.ssd_print, mlperf_log.SSD_TAG_SET, + tags.SSD) + } + + try: + self._log_fn, self.tag_set, self.mlperf_model_name = model_to_info[model] + except KeyError: + raise ValueError('--ml_perf_compliance_logging is only compatible when ' + '--model is one of the following: ' + + ', '.join(model_to_info.keys())) + + def log(self, key, value=None, stack_offset=2): + if key in self.tag_set: + self._log_fn(key, value, stack_offset) + else: + print('Ignoring MLPerf logging item key=%s, value=%s for model %s' % + (key, value, self.model)) + + def log_deferred_tensor_value(self, key, tensor_value, global_step, + stack_offset=2, every_n=1): + """Logs the value of a tensor when the graph is run.""" + caller = '(%s)' % mlperf_log.get_caller(stack_offset, self._root_dir) + def create_print_op(): + return tf.print(_MLPERF_LOG_PREFIX, self.mlperf_model_name, + tf.timestamp(), caller, key, + ': { "deferred": true, "value":', tensor_value, '}', + output_stream=sys.stdout) + maybe_print = tf.cond(tf.equal(global_step % every_n, 0), create_print_op, + tf.no_op) + with tf.control_dependencies([maybe_print]): + return tf.identity(tensor_value) + + def log_max_pool(self, input_tensor, output_tensor): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_max_pool(input_tensor, output_tensor) + + def log_begin_block(self, input_tensor, block_type): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_begin_block(input_tensor, block_type) + + def 
log_end_block(self, output_tensor): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_end_block(output_tensor) + + def log_projection(self, input_tensor, output_tensor): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_projection(input_tensor, output_tensor) + + def log_conv2d(self, input_tensor, output_tensor, stride_height, stride_width, + filters, initializer, use_bias): + """Log a conv2d call.""" + if self.model == 'resnet50_v1.5': + assert stride_height == stride_width, ( + '--ml_perf_compliance_logging does not support convolutions where ' + 'the stride height is not equal to the stride width. ' + 'stride_height=%d, stride_width=%d' % (stride_height, stride_width)) + if isinstance(initializer, tf.truncated_normal_initializer) or ( + isinstance(initializer, tf.variance_scaling_initializer) and + initializer.distribution == 'truncated_normal'): + initializer = tags.TRUNCATED_NORMAL + elif (isinstance(initializer, tf.glorot_uniform_initializer) or + initializer is None): + initializer = 'glorot_uniform' + resnet_log_helper.log_conv2d(input_tensor, output_tensor, stride_width, + filters, initializer, use_bias) + + def log_batch_norm(self, input_tensor, output_tensor, momentum, epsilon, + center, scale, training): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_batch_norm(input_tensor, output_tensor, momentum, + epsilon, center, scale, training) + + def log_train_epochs(self, num_epochs): + """Logs all the TRAIN_EPOCHs log lines.""" + num_epochs_int = int(num_epochs) + for i in range(num_epochs_int): + # MLPerf allows us to print all the train epochs at once instead of + # printing them as we do them. + self.log(key=mlperf_log.TRAIN_EPOCH, value=i, stack_offset=3) + if num_epochs_int != num_epochs: + value = (str(num_epochs_int) + + ', but this epoch only has {}% of the examples of a normal epoch' + .format(100 * (num_epochs - num_epochs_int))) + self.log(key=mlperf_log.TRAIN_EPOCH, value=value, stack_offset=3) + + def log_input_resize_aspect_preserving(self, height, width, scale_factor): + assert height == width, ( + '--ml_perf_compliance_logging does not support models with nonsquare ' + 'images. Cannot process image with height=%d and width=%d' % + (height, width)) + self.log(key=tags.INPUT_RESIZE_ASPECT_PRESERVING, + value={'min': int(height * scale_factor)}) + + def log_eval_epoch(self, tag, global_step, batch_size, stack_offset=2): + if self.model == 'resnet50_v1.5': + self.log(key=tag, stack_offset=stack_offset+1) + elif self.model == 'ssd300': + epoch = int(global_step * batch_size / 118287) + self.log(key=tag, value=epoch, stack_offset=stack_offset+1) + + def log_eval_accuracy(self, accuracy, global_step, batch_size, + examples_per_epoch, stack_offset=2): + """Logs eval accuracy.""" + epoch = int(global_step * batch_size / examples_per_epoch) + eval_accuracy = {'epoch': epoch, 'value': accuracy} + eval_iteration_accuracy = {'iteration': global_step, 'value': accuracy} + self.log(key=tags.EVAL_ACCURACY, value=eval_accuracy, + stack_offset=stack_offset+1) + self.log(key=tags.EVAL_ITERATION_ACCURACY, + value=eval_iteration_accuracy, + stack_offset=stack_offset+1) + + +def _empty_fn(*args, **kwargs): + del args, kwargs + + +class NullMlPerfLogger(object): + """A version of `MlPerfLogger` that does not log anything. + + This class has the same interface as `MlPerfLogger`, but does not actually do + anything. This is used when logging is disabled, which is the default + behavior. 
+ """ + + def __getattr__(self, item): + return _empty_fn + + def log_deferred_tensor_value(self, key, tensor_value, *args, **kwargs): + del key, args, kwargs + return tensor_value + + +# A global singleton logger. By default, it's the null logger but can be +# switched to an MlPerfLogger with `mlperf_logger()`. +logger = NullMlPerfLogger() + + +@contextlib.contextmanager +def mlperf_logger(use_mlperf_logger, model): + """Optionally enable the mlperf logger. + + If `use_mlperf_logger` is True, sets the `logger` global variable to an + instance of MlPerfLogger that will print logs for MLPerf compliance. If + `use_mlperf_logger` is False, does nothing. + + Args: + use_mlperf_logger: If True, enables the mlperf logger. If False, this + function does nothing. + model: The model that will be logged. Required, because different models + must log different things for MLPerf compliance. + + Yields: + Nothing. + + Raises: + ImportError: If `use_mlperf_logger` is True but the MLPerf compliance + library cannot be imported + """ + global logger + if use_mlperf_logger: + if not import_successful: + raise ImportError('Failed to import MLPerf compliance library, which is ' + 'required when --ml_perf_compliance_logging is ' + 'specified. Clone this repo and add this directory ' + 'https://github.com/mlperf/training/tree/master/' + 'compliance to the PYTHONPATH environmental variable.') + logger_ = MlPerfLogger(model) + old_logger = logger + try: + logger = logger_ + yield + finally: + logger = old_logger + else: + yield diff --git a/cv/classification/resnet50/tensorflow/mlperf_test.py b/cv/classification/resnet50/tensorflow/mlperf_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7e83fc29603580b24466c22db2de3732f3d6c13e --- /dev/null +++ b/cv/classification/resnet50/tensorflow/mlperf_test.py @@ -0,0 +1,189 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains tests related to MLPerf. + +Note this test only passes if the MLPerf compliance library is installed. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import Counter +import logging +import re + +import six +import tensorflow.compat.v1 as tf +import benchmark_cnn +import datasets +import mlperf +import test_util +from models import model +from mlperf_compliance import mlperf_log + + +class _MlPerfTestModel(model.CNNModel): + """A model to test the MLPerf compliance logging on.""" + + def __init__(self): + super(_MlPerfTestModel, self).__init__( + 'mlperf_test_model', image_size=224, batch_size=2, learning_rate=1) + + def add_inference(self, cnn): + assert cnn.top_layer.shape[1:] == (3, 224, 224) + cnn.conv(1, 1, 1, 1, 1, use_batch_norm=True) + cnn.mpool(1, 1, 1, 1, num_channels_in=1) + cnn.reshape([-1, 224 * 224]) + cnn.affine(1, activation=None) + + # Assert that the batch norm variables are filtered out for L2 loss. + variables = tf.global_variables() + tf.local_variables() + assert len(variables) > len(self.filter_l2_loss_vars(variables)) + + +class MlPerfComplianceTest(tf.test.TestCase): + """Tests the MLPerf compliance logs. + + This serves as a quick check that we probably didn't break the compliance + logging. It is not mean to be as comprehensive as the official MLPerf + compliance checker will be. + """ + + def setUp(self): + super(MlPerfComplianceTest, self).setUp() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + # Map between regex and the number of times we expect to see that regex in the + # logs. Entry commented out with the comment FIXME indicate that + # tf_cnn_benchmarks currently fails compliance in that regard, and needs to be + # fixed to be MLPerf compliant. + EXPECTED_LOG_REGEXES = { + # Preprocessing tags + mlperf.tags.INPUT_ORDER: 2, # 1 for training, 1 for eval + # We pass --tf_random_seed=9876 in the test. + r'%s: 9876' % mlperf.tags.RUN_SET_RANDOM_SEED: 2, + # The Numpy random seed is hardcoded to 4321. + r'%s: 4321' % mlperf.tags.RUN_SET_RANDOM_SEED: 2, + r'%s: %d' % (mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES, + datasets.IMAGENET_NUM_TRAIN_IMAGES): 1, + r'%s: %d' % (mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES, + datasets.IMAGENET_NUM_VAL_IMAGES): 1, + mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS + '.*': 1, + mlperf.tags.INPUT_RANDOM_FLIP + '.*': 1, + r'%s: \[224, 224\].*' % mlperf.tags.INPUT_CENTRAL_CROP: 1, + + r'%s: \[123.68, 116.78, 103.94\].*' % mlperf.tags.INPUT_MEAN_SUBTRACTION: + 2, + + r'%s: {"min": 256}.*' % mlperf.tags.INPUT_RESIZE_ASPECT_PRESERVING: 1, + + # 1 for training, 1 for eval + r'%s: \[224, 224\].*' % mlperf.tags.INPUT_RESIZE: 2, + + # Resnet model tags + mlperf.tags.MODEL_HP_BATCH_NORM + '.*': 2, + # 2 for training, 2 for eval. Although there's only 1 conv2d, each conv2d + # produces 2 logs. + mlperf.tags.MODEL_HP_CONV2D_FIXED_PADDING + '.*': 4, + mlperf.tags.MODEL_HP_RELU + '.*': 2, + mlperf.tags.MODEL_HP_INITIAL_MAX_POOL + '.*': 2, + mlperf.tags.MODEL_HP_DENSE + '.*': 4, + mlperf.tags.MODEL_HP_DENSE + '.*': 4, + + # Note that tags our test model does not emit, like MODEL_HP_SHORTCUT_ADD, + # are omitted here. 
+ + r'%s: "categorical_cross_entropy".*' % mlperf.tags.MODEL_HP_LOSS_FN: 1, + + # 1 for training, 2 because the _MlPerfTestModel calls this when building + # the model for both training and eval + r'%s: true' % mlperf.tags.MODEL_EXCLUDE_BN_FROM_L2: 3, + + r'%s: 0.5.*' % mlperf.tags.MODEL_L2_REGULARIZATION: 1, + + # Note we do not handle OPT_LR, since that is printed to stderr using + # tf.Print, which we cannot easily intercept. + + # Other tags + '%s: "%s"' % (mlperf.tags.OPT_NAME, mlperf.tags.SGD_WITH_MOMENTUM): 1, + '%s: 0.5' % mlperf.tags.OPT_MOMENTUM: 1, + mlperf.tags.RUN_START: 1, + '%s: 2' % mlperf.tags.INPUT_BATCH_SIZE: 1, + mlperf.tags.TRAIN_LOOP: 1, + mlperf.tags.TRAIN_EPOCH + '.*': 1, + '%s: 2' % mlperf.tags.INPUT_SIZE: 2, + mlperf.tags.EVAL_START: 2, + mlperf.tags.EVAL_STOP: 2, + '%s: 6' % mlperf.tags.EVAL_SIZE: 2, + mlperf.tags.EVAL_ACCURACY + '.*': 2, + '%s: 2.0' % mlperf.tags.EVAL_TARGET: 2, + mlperf.tags.RUN_STOP + '.*': 1, + mlperf.tags.RUN_FINAL: 1 + } + EXPECTED_LOG_REGEXES = Counter({re.compile(k): v for + k, v in EXPECTED_LOG_REGEXES.items()}) + + def testMlPerfCompliance(self): + string_io = six.StringIO() + handler = logging.StreamHandler(string_io) + data_dir = test_util.create_black_and_white_images() + try: + mlperf_log.LOGGER.addHandler(handler) + params = benchmark_cnn.make_params(data_dir=data_dir, + data_name='imagenet', + batch_size=2, + num_warmup_batches=0, + num_batches=2, + num_eval_batches=3, + eval_during_training_every_n_steps=1, + distortions=False, + weight_decay=0.5, + optimizer='momentum', + momentum=0.5, + stop_at_top_1_accuracy=2.0, + tf_random_seed=9876, + ml_perf=True) + with mlperf.mlperf_logger(use_mlperf_logger=True, model='resnet50_v1.5'): + bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=_MlPerfTestModel()) + bench_cnn.run() + logs = string_io.getvalue().splitlines() + log_regexes = Counter() + for log in logs: + for regex in self.EXPECTED_LOG_REGEXES: + if regex.search(log): + log_regexes[regex] += 1 + if log_regexes != self.EXPECTED_LOG_REGEXES: + diff_counter = Counter(log_regexes) + diff_counter.subtract(self.EXPECTED_LOG_REGEXES) + differences = [] + for regex in (k for k in diff_counter.keys() if diff_counter[k]): + found_count = log_regexes[regex] + expected_count = self.EXPECTED_LOG_REGEXES[regex] + differences.append(' For regex %s: Found %d lines matching but ' + 'expected to find %d' % + (regex.pattern, found_count, expected_count)) + raise AssertionError('Logs did not match expected logs. Differences:\n' + '%s' % '\n'.join(differences)) + finally: + mlperf_log.LOGGER.removeHandler(handler) + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/__init__.py b/cv/classification/resnet50/tensorflow/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/models/alexnet_model.py b/cv/classification/resnet50/tensorflow/models/alexnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4611fd60d19a3dd704e47323e7fa9a5320f596 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/alexnet_model.py @@ -0,0 +1,93 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Alexnet model configuration. + +References: + Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton + ImageNet Classification with Deep Convolutional Neural Networks + Advances in Neural Information Processing Systems. 2012 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from models import model + + +class AlexnetModel(model.CNNModel): + """Alexnet cnn model.""" + + def __init__(self, params=None): + super(AlexnetModel, self).__init__( + 'alexnet', 224 + 3, 512, 0.005, params=params) + + def add_inference(self, cnn): + # Note: VALID requires padding the images by 3 in width and height + cnn.conv(64, 11, 11, 4, 4, 'VALID') + cnn.mpool(3, 3, 2, 2) + cnn.conv(192, 5, 5) + cnn.mpool(3, 3, 2, 2) + cnn.conv(384, 3, 3) + cnn.conv(384, 3, 3) + cnn.conv(256, 3, 3) + cnn.mpool(3, 3, 2, 2) + cnn.reshape([-1, 256 * 6 * 6]) + cnn.affine(4096) + cnn.dropout() + cnn.affine(4096) + cnn.dropout() + + +class AlexnetCifar10Model(model.CNNModel): + """Alexnet cnn model for cifar datasets. + + The model architecture follows the one defined in the tensorflow tutorial + model. + + Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py + Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf + """ + + def __init__(self, params=None): + super(AlexnetCifar10Model, self).__init__( + 'alexnet', 32, 128, 0.1, params=params) + + def add_inference(self, cnn): + cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2) + cnn.mpool(3, 3, 2, 2, mode='SAME') + cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) + cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2) + cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) + cnn.mpool(3, 3, 2, 2, mode='SAME') + shape = cnn.top_layer.get_shape().as_list() + flat_dim = shape[1] * shape[2] * shape[3] + cnn.reshape([-1, flat_dim]) + cnn.affine(384, stddev=0.04, bias=0.1) + cnn.affine(192, stddev=0.04, bias=0.1) + + def get_learning_rate(self, global_step, batch_size): + num_examples_per_epoch = 50000 + num_epochs_per_decay = 100 + decay_steps = ( + num_epochs_per_decay * num_examples_per_epoch // batch_size) + decay_factor = 0.1 + return tf.train.exponential_decay( + self.learning_rate, + global_step, + decay_steps, + decay_factor, + staircase=True) diff --git a/cv/classification/resnet50/tensorflow/models/densenet_model.py b/cv/classification/resnet50/tensorflow/models/densenet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cb61b9b3f3332587daa2e308ba6d722cba408e1b --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/densenet_model.py @@ -0,0 +1,100 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
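The CIFAR-10 AlexNet variant above uses a staircase exponential decay: the rate drops by 10x every `num_epochs_per_decay * num_examples_per_epoch // batch_size` steps. A quick back-of-the-envelope sketch of that schedule in plain Python, rather than `tf.train.exponential_decay`:

```python
# Staircase exponential decay, mirroring the schedule used for AlexnetCifar10Model.
def staircase_lr(base_lr, global_step, batch_size,
                 num_examples_per_epoch=50000, num_epochs_per_decay=100,
                 decay_factor=0.1):
    decay_steps = num_epochs_per_decay * num_examples_per_epoch // batch_size
    return base_lr * decay_factor ** (global_step // decay_steps)

# With batch_size=128, decay_steps is 39062, so the rate stays at 0.1 until
# that step and then drops by a factor of 10.
print(staircase_lr(0.1, 0, 128))      # 0.1
print(staircase_lr(0.1, 39062, 128))  # drops to 0.01 at the first decay step
```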
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Densenet model configuration. + +References: + "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf +from models import model as model_lib + + +class DensenetCifar10Model(model_lib.CNNModel): + """Densenet cnn network configuration.""" + + def __init__(self, model, layer_counts, growth_rate, params=None): + self.growth_rate = growth_rate + super(DensenetCifar10Model, self).__init__( + model, 32, 64, 0.1, layer_counts=layer_counts, params=params) + self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} + + def dense_block(self, cnn, growth_rate): + input_layer = cnn.top_layer + c = cnn.batch_norm(input_layer, **self.batch_norm_config) + c = tf.nn.relu(c) + c = cnn.conv(growth_rate, 3, 3, 1, 1, stddev=np.sqrt(2.0/9/growth_rate), + activation=None, input_layer=c) + channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 + cnn.top_layer = tf.concat([input_layer, c], channel_index) + cnn.top_size += growth_rate + + def transition_layer(self, cnn): + in_size = cnn.top_size + cnn.batch_norm(**self.batch_norm_config) + cnn.top_layer = tf.nn.relu(cnn.top_layer) + cnn.conv(in_size, 1, 1, 1, 1, stddev=np.sqrt(2.0/9/in_size)) + cnn.apool(2, 2, 2, 2) + + def add_inference(self, cnn): + if self.layer_counts is None: + raise ValueError('Layer counts not specified for %s' % self.get_model()) + if self.growth_rate is None: + raise ValueError('Growth rate not specified for %s' % self.get_model()) + + cnn.conv(16, 3, 3, 1, 1, activation=None) + # Block 1 + for _ in xrange(self.layer_counts[0]): + self.dense_block(cnn, self.growth_rate) + self.transition_layer(cnn) + # Block 2 + for _ in xrange(self.layer_counts[1]): + self.dense_block(cnn, self.growth_rate) + self.transition_layer(cnn) + # Block 3 + for _ in xrange(self.layer_counts[2]): + self.dense_block(cnn, self.growth_rate) + cnn.batch_norm(**self.batch_norm_config) + cnn.top_layer = tf.nn.relu(cnn.top_layer) + channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 + cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index] + cnn.spatial_mean() + + def get_learning_rate(self, global_step, batch_size): + num_batches_per_epoch = 50000 // batch_size + boundaries = num_batches_per_epoch * np.array([150, 225, 300], + dtype=np.int64) + boundaries = [x for x in boundaries] + values = [0.1, 0.01, 0.001, 0.0001] + return tf.train.piecewise_constant(global_step, boundaries, values) + + +def create_densenet40_k12_model(): + return DensenetCifar10Model('densenet40_k12', (12, 12, 12), 12) + + +def create_densenet100_k12_model(): + return DensenetCifar10Model('densenet100_k12', (32, 32, 32), 12) + + +def create_densenet100_k24_model(): + return DensenetCifar10Model('densenet100_k24', (32, 32, 32), 24) diff --git 
a/cv/classification/resnet50/tensorflow/models/experimental/__init__.py b/cv/classification/resnet50/tensorflow/models/experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/models/experimental/deepspeech.py b/cv/classification/resnet50/tensorflow/models/experimental/deepspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..24e242f6db9d113a718194df3f9aca45a03da886 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/experimental/deepspeech.py @@ -0,0 +1,449 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""DeepSpeech2 model configuration. + +References: + https://arxiv.org/abs/1512.02595 + Deep Speech 2: End-to-End Speech Recognition in English and Mandarin +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf +import constants +from cnn_util import log_fn +from models import model as model_lib +from tensorflow.python.ops import variables # pylint: disable=g-direct-tensorflow-import + + +class DeepSpeechDecoder(object): + """Greedy decoder implementation for Deep Speech model.""" + + def __init__(self, labels, blank_index=28): + """Decoder initialization. + + Arguments: + labels: a string specifying the speech labels for the decoder to use. + blank_index: an integer specifying index for the blank character. Defaults + to 28. + """ + self.labels = labels + self.blank_index = blank_index + self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) + + def convert_to_string(self, sequence): + """Convert a sequence of indexes into corresponding string.""" + return ''.join([self.int_to_char[i] for i in sequence]) + + def wer(self, decode, target): + """Computes the Word Error Rate (WER). + + WER is defined as the edit distance between the two provided sentences after + tokenizing to words. + + Args: + decode: string of the decoded output. + target: a string for the ground truth label. + + Returns: + A float number for the WER of the current decode-target pair. + """ + try: + from nltk.metrics import distance # pylint: disable=g-import-not-at-top + except ImportError as e: + if 'nltk.metrics' not in e.message: + raise + raise ImportError('To use the experimental deepspeech model, you must ' + 'pip install -U nltk') + + # Map each word to a new char. 
+ words = set(decode.split() + target.split()) + word2char = dict(zip(words, range(len(words)))) + + new_decode = [chr(word2char[w]) for w in decode.split()] + new_target = [chr(word2char[w]) for w in target.split()] + + return distance.edit_distance(''.join(new_decode), ''.join(new_target)) + + def cer(self, decode, target): + """Computes the Character Error Rate (CER). + + CER is defined as the edit distance between the two given strings. + + Args: + decode: a string of the decoded output. + target: a string for the ground truth label. + + Returns: + A float number denoting the CER for the current sentence pair. + """ + try: + from nltk.metrics import distance # pylint: disable=g-import-not-at-top + except ImportError as e: + if 'nltk.metrics' not in e.message: + raise + raise ImportError('To use the experimental deepspeech model, you must ' + 'pip install -U nltk') + return distance.edit_distance(decode, target) + + def decode(self, char_indexes): + """Decode the best guess from logits using greedy algorithm.""" + # Merge repeated chars. + merge = [k for k, _ in itertools.groupby(char_indexes)] + # Remove the blank index in the decoded sequence. + merge_remove_blank = [] + for k in merge: + if k != self.blank_index: + merge_remove_blank.append(k) + + return self.convert_to_string(merge_remove_blank) + + def decode_logits(self, logits): + """Decode the best guess from logits using greedy algorithm.""" + # Choose the class with maximimum probability. + best = list(np.argmax(logits, axis=1)) + return self.decode(best) + + +class DeepSpeech2Model(model_lib.Model): + """Define DeepSpeech2 model.""" + + # Supported rnn cells. + SUPPORTED_RNNS = { + 'lstm': tf.nn.rnn_cell.BasicLSTMCell, + 'rnn': tf.nn.rnn_cell.RNNCell, + 'gru': tf.nn.rnn_cell.GRUCell, + } + + # Parameters for batch normalization. + BATCH_NORM_EPSILON = 1e-5 + BATCH_NORM_DECAY = 0.997 + + # Filters of convolution layer + CONV_FILTERS = 32 + + def __init__(self, + num_rnn_layers=5, + rnn_type='lstm', + is_bidirectional=True, + rnn_hidden_size=800, + use_bias=True, + params=None): + """Initialize DeepSpeech2 model. + + Args: + num_rnn_layers: an integer, the number of rnn layers (default: 5). + rnn_type: a string, one of the supported rnn cells: gru, rnn or lstm. + is_bidirectional: a boolean to indicate if the rnn layer is bidirectional. + rnn_hidden_size: an integer for the number of hidden units in the RNN + cell. + use_bias: a boolean specifying whether to use a bias in the last fc layer. + params: the params from BenchmarkCNN. + """ + super(DeepSpeech2Model, self).__init__( + 'deepspeech2', + batch_size=128, + learning_rate=0.0005, + fp16_loss_scale=128, + params=params) + self.num_rnn_layers = num_rnn_layers + self.rnn_type = rnn_type + self.is_bidirectional = is_bidirectional + self.rnn_hidden_size = rnn_hidden_size + self.use_bias = use_bias + self.num_feature_bins = 161 + self.max_time_steps = 3494 + self.max_label_length = 576 + + def _batch_norm(self, inputs, training): + """Batch normalization layer. + + Note that the momentum to use will affect validation accuracy over time. + Batch norm has different behaviors during training/evaluation. With a large + momentum, the model takes longer to get a near-accurate estimation of the + moving mean/variance over the entire training dataset, which means we need + more iterations to see good evaluation results. If the training data is + evenly distributed over the feature space, we can also try setting a smaller + momentum (such as 0.1) to get good evaluation result sooner. 
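The decoder above implements greedy CTC decoding (collapse repeated indices, then drop the blank) and computes WER by remapping whole words to single characters so an ordinary string edit distance can be reused. The sketch below mirrors both ideas with a toy label set; it uses a generic-sequence Levenshtein implementation directly instead of the word-to-character remapping and `nltk`, and it normalizes by the reference length for readability, whereas the class above returns the raw edit distance.

```python
import itertools

def greedy_ctc_decode(indices, blank_index):
    """Collapse repeated indices, then remove blanks (greedy CTC decode)."""
    merged = [k for k, _ in itertools.groupby(indices)]
    return [k for k in merged if k != blank_index]

def edit_distance(a, b):
    """Plain Levenshtein distance between two sequences."""
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]

def wer(decoded, target):
    """Word error rate, normalized by the number of reference words."""
    d, t = decoded.split(), target.split()
    return edit_distance(d, t) / max(len(t), 1)

# Toy example: labels 0..3 map to ' abc', index 4 is the CTC blank.
labels = ' abc'
decoded = ''.join(labels[i] for i in greedy_ctc_decode([1, 1, 4, 2, 4, 4, 3], 4))
print(decoded)                              # 'abc'
print(wer('the cat sat', 'the cat sat down'))  # 0.25 (one insertion over four words)
```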
+ + Args: + inputs: input data for batch norm layer. + training: a boolean to indicate if it is in training stage. + + Returns: + tensor output from batch norm layer. + """ + return tf.layers.batch_normalization( + inputs=inputs, + momentum=DeepSpeech2Model.BATCH_NORM_DECAY, + epsilon=DeepSpeech2Model.BATCH_NORM_EPSILON, + fused=True, + training=training) + + def _conv_bn_layer(self, inputs, padding, filters, kernel_size, strides, + layer_id, training): + """Defines 2D convolutional + batch normalization layer. + + Args: + inputs: input data for convolution layer. + padding: padding to be applied before convolution layer. + filters: an integer, number of output filters in the convolution. + kernel_size: a tuple specifying the height and width of the 2D convolution + window. + strides: a tuple specifying the stride length of the convolution. + layer_id: an integer specifying the layer index. + training: a boolean to indicate which stage we are in (training/eval). + + Returns: + tensor output from the current layer. + """ + # Perform symmetric padding on the feature dimension of time_step + # This step is required to avoid issues when RNN output sequence is shorter + # than the label length. + inputs = tf.pad( + inputs, + [[0, 0], [padding[0], padding[0]], [padding[1], padding[1]], [0, 0]]) + inputs = tf.layers.conv2d( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding='valid', + use_bias=False, + activation=tf.nn.relu6, + name='cnn_{}'.format(layer_id)) + return self._batch_norm(inputs, training) + + def _rnn_layer(self, inputs, rnn_cell, rnn_hidden_size, layer_id, + use_batch_norm, is_bidirectional, training): + """Defines a batch normalization + rnn layer. + + Args: + inputs: input tensors for the current layer. + rnn_cell: RNN cell instance to use. + rnn_hidden_size: an integer for the dimensionality of the rnn output + space. + layer_id: an integer for the index of current layer. + use_batch_norm: a boolean specifying whether to perform batch + normalization on input states. + is_bidirectional: a boolean specifying whether the rnn layer is + bi-directional. + training: a boolean to indicate which stage we are in (training/eval). + + Returns: + tensor output for the current layer. + """ + if use_batch_norm: + inputs = self._batch_norm(inputs, training) + + # Construct forward/backward RNN cells. + fw_cell = rnn_cell( + num_units=rnn_hidden_size, name='rnn_fw_{}'.format(layer_id)) + + if is_bidirectional: + bw_cell = rnn_cell( + num_units=rnn_hidden_size, name='rnn_bw_{}'.format(layer_id)) + outputs, _ = tf.nn.bidirectional_dynamic_rnn( + cell_fw=fw_cell, + cell_bw=bw_cell, + inputs=inputs, + dtype=tf.float32, + swap_memory=True) + rnn_outputs = tf.concat(outputs, -1) + else: + rnn_outputs = tf.nn.dynamic_rnn( + fw_cell, inputs, dtype=tf.float32, swap_memory=True) + + return rnn_outputs + + def get_input_data_types(self, subset): + """Returns the list of data types of the inputs.""" + del subset # Same data types for both train and validation subsets. 
+ return [self.data_type, tf.int32, tf.int32, tf.int32] + + def get_input_shapes(self, subset): + """Returns the list of shapes of the padded inputs.""" + del subset # Same shapes for both train and validation subsets + return [ + [self.batch_size, self.max_time_steps, self.num_feature_bins, 1], + [self.batch_size, self.max_label_length], + [self.batch_size, 1], + [self.batch_size, 1], + ] + + def get_synthetic_inputs(self, input_name, nclass): + inputs = tf.random_uniform(self.get_input_shapes('train')[0], + dtype=self.get_input_data_types('train')[0]) + inputs = variables.VariableV1(inputs, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name=input_name) + labels = tf.convert_to_tensor( + np.random.randint(28, size=[self.batch_size, self.max_label_length])) + input_lengths = tf.convert_to_tensor( + [self.max_time_steps] * self.batch_size) + label_lengths = tf.convert_to_tensor( + [self.max_label_length] * self.batch_size) + return [inputs, labels, input_lengths, label_lengths] + + # TODO(laigd): support fp16. + # TODO(laigd): support multiple gpus. + def build_network(self, inputs, phase_train=True, nclass=29): + """Builds the forward pass of the deepspeech2 model. + + Args: + inputs: The input list of the model. + phase_train: True during training. False during evaluation. + nclass: Number of classes that the input spectrogram can belong to. + + Returns: + A BuildNetworkResult which contains the logits and model-specific extra + information. + """ + inputs = inputs[0] # Get the spectrogram feature. + + # Two cnn layers. + inputs = self._conv_bn_layer( + inputs, + padding=(20, 5), + filters=DeepSpeech2Model.CONV_FILTERS, + kernel_size=(41, 11), + strides=(2, 2), + layer_id=1, + training=phase_train) + + inputs = self._conv_bn_layer( + inputs, + padding=(10, 5), + filters=DeepSpeech2Model.CONV_FILTERS, + kernel_size=(21, 11), + strides=(2, 1), + layer_id=2, + training=phase_train) + + # output of conv_layer2 with the shape of + # [batch_size (N), times (T), features (F), channels (C)]. + # Convert the conv output to rnn input. + + # batch_size = tf.shape(inputs)[0] + feat_size = inputs.get_shape().as_list()[2] + inputs = tf.reshape( + inputs, + [self.batch_size, -1, feat_size * DeepSpeech2Model.CONV_FILTERS]) + + # RNN layers. + rnn_cell = DeepSpeech2Model.SUPPORTED_RNNS[self.rnn_type] + for layer_counter in xrange(self.num_rnn_layers): + # No batch normalization on the first layer. + use_batch_norm = (layer_counter != 0) + inputs = self._rnn_layer(inputs, rnn_cell, self.rnn_hidden_size, + layer_counter + 1, use_batch_norm, + self.is_bidirectional, phase_train) + + # FC layer with batch norm. + inputs = self._batch_norm(inputs, phase_train) + logits = tf.layers.dense(inputs, nclass, use_bias=self.use_bias) + + return model_lib.BuildNetworkResult(logits=logits, extra_info=None) + + def loss_function(self, inputs, build_network_result): + """Computes the ctc loss for the current batch of predictions. + + Args: + inputs: the input list of the model. + build_network_result: a BuildNetworkResult returned by build_network(). + + Returns: + The loss tensor of the model. 
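`build_network` above pads the spectrogram symmetrically and then applies 'VALID' convolutions, so the surviving size on each axis follows the usual formula `(size + 2*pad - kernel) // stride + 1`. A small helper makes it easy to sanity-check the shapes, using the defaults quoted above (3494 time steps, 161 feature bins, kernels (41, 11) and (21, 11), time strides 2 and 2, feature strides 2 and 1, 32 filters):

```python
# Output size of a 'VALID' conv applied after symmetric padding, as in
# _conv_bn_layer above: pad both sides by `pad`, then slide a `kernel`-wide
# window with the given `stride`.
def conv_out_len(size, pad, kernel, stride):
    return (size + 2 * pad - kernel) // stride + 1

# Time axis (3494 padded frames):
t = conv_out_len(3494, pad=20, kernel=41, stride=2)  # layer 1 -> 1747
t = conv_out_len(t, pad=10, kernel=21, stride=2)     # layer 2 -> 874

# Feature axis (161 bins):
f = conv_out_len(161, pad=5, kernel=11, stride=2)    # layer 1 -> 81
f = conv_out_len(f, pad=5, kernel=11, stride=1)      # layer 2 -> 81

print(t, f * 32)  # 874 time steps, 81 * 32 = 2592 features fed to the first RNN layer
```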
+ """ + logits = build_network_result.logits + actual_time_steps = inputs[2] + probs = tf.nn.softmax(logits) + ctc_time_steps = tf.shape(probs)[1] + ctc_input_length = tf.to_float( + tf.multiply(actual_time_steps, ctc_time_steps)) + ctc_input_length = tf.to_int32( + tf.floordiv(ctc_input_length, tf.to_float(self.max_time_steps))) + + label_length = inputs[3] + label_length = tf.to_int32(tf.squeeze(label_length)) + ctc_input_length = tf.to_int32(tf.squeeze(ctc_input_length)) + + labels = inputs[1] + sparse_labels = tf.to_int32( + tf.keras.backend.ctc_label_dense_to_sparse(labels, label_length)) + y_pred = tf.log( + tf.transpose(probs, perm=[1, 0, 2]) + tf.keras.backend.epsilon()) + + losses = tf.expand_dims( + tf.nn.ctc_loss( + labels=sparse_labels, + inputs=y_pred, + sequence_length=ctc_input_length, + ignore_longer_outputs_than_inputs=True), + axis=1) + loss = tf.reduce_mean(losses) + return loss + + PROBABILITY_TENSOR = 'deepspeech2_prob' + LABEL_TENSOR = 'deepspeech2_label' + + def accuracy_function(self, inputs, logits): + """Returns the ops to evaluate the model performance.""" + # Get probabilities of each predicted class + probs = tf.nn.softmax(logits) + assert probs.shape.as_list()[0] == self.batch_size + return { + (constants.UNREDUCED_ACCURACY_OP_PREFIX + self.PROBABILITY_TENSOR): + probs, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + self.LABEL_TENSOR): + inputs[1], + } + + def postprocess(self, results): + """Postprocess results returned from model in Python.""" + probs = results[self.PROBABILITY_TENSOR] + + total_wer, total_cer = 0, 0 + speech_labels = " abcdefghijklmnopqrstuvwxyz'-" + greedy_decoder = DeepSpeechDecoder(speech_labels) + + # Evaluate the performance using WER (Word Error Rate) and CER (Character + # Error Rate) as metrics. + targets = results[self.LABEL_TENSOR] # The ground truth transcript + for i in range(self.batch_size): + # Decode string. + predicted_str = greedy_decoder.decode_logits(probs[i]) + expected_str = greedy_decoder.decode(targets[i]) + # Compute CER. + total_cer += (greedy_decoder.cer(predicted_str, expected_str) / + len(expected_str)) + # Compute WER. + total_wer += (greedy_decoder.wer(predicted_str, expected_str) / + len(expected_str.split())) + + # Get mean value + total_cer /= self.batch_size + total_wer /= self.batch_size + + log_fn('total CER: {:f}; total WER: {:f}; total example: {:d}.'.format( + total_cer, total_wer, self.batch_size)) + # TODO(laigd): get rid of top_N_accuracy bindings in benchmark_cnn.py + return {'top_1_accuracy': 0., 'top_5_accuracy': 0.} diff --git a/cv/classification/resnet50/tensorflow/models/experimental/official_ncf_model.py b/cv/classification/resnet50/tensorflow/models/experimental/official_ncf_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6ca513f9f0c3f9b7c67bc7a072ed0b35fd4f5a --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/experimental/official_ncf_model.py @@ -0,0 +1,172 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
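The CTC loss above rescales each utterance's true frame count into the CTC time base: the conv stack shrinks the time axis, so the per-example length becomes roughly `actual_steps * ctc_time_steps / max_time_steps`, floored to an integer. The postprocess step then normalizes CER by the number of reference characters and WER by the number of reference words. A plain-Python sketch of the length rescaling:

```python
# Rescale an utterance's original frame count into the CTC time base, as
# loss_function above does with tf.multiply / tf.floordiv / tf.to_int32.
def ctc_input_length(actual_steps, ctc_time_steps, max_time_steps):
    return int(actual_steps * ctc_time_steps) // max_time_steps

# E.g. an utterance that filled 1000 of the 3494 padded frames, with the conv
# stack emitting 874 CTC time steps, contributes 250 valid CTC steps.
print(ctc_input_length(1000, 874, 3494))  # 250
```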
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrap the official recommendation model in a tf_cnn_benchmarks Model. + +This allows the recommendation NCF model to be used in tf_cnn_benchmarks. +Currently, the implementation is fairly hacky, because tf_cnn_benchmarks is +intended to be used only with CNNs. + +Only synthetic data with 1 GPU is currently supported. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +from models import model + + +# Obtained by running the official NCF model with the following command: +# python ncf_main.py --dataset ml-20m +# and printing the number of users and items here: +# https://github.com/tensorflow/models/blob/d089975f630a8a01be63e45ef08a31be14bb96b4/official/recommendation/data_preprocessing.py#L68 +_NUM_USERS_20M = 138493 +_NUM_ITEMS_20M = 26744 + + +# TODO(reedwm): Support multi-GPU. Currently keras layers, which this model +# uses, ignore variable_scopes, which we rely on for multi-GPU support. +# TODO(reedwm): Support real data. This will require a significant refactor. +# TODO(reedwm): All-reduce IndexedSlices more effectively. +# TODO(reedwm): Support the 1M variant of this model. + + +class NcfModel(model.Model): + r"""A model.Model wrapper around the official NCF recommendation model. + + To do an NCF run with synthetic data that roughly matches what the official + model does, run: + + python tf_cnn_benchmarks.py --optimizer=adam --model=ncf --batch_size=65536 \ + --weight_decay=0 --sparse_to_dense_grads + """ + + def __init__(self, params=None): + super(NcfModel, self).__init__( + 'official_ncf', batch_size=2048, learning_rate=0.0005, + fp16_loss_scale=128, params=params) + if self.fp16_vars: + raise ValueError('NCF model only supports float32 variables for now.') + + def build_network(self, inputs, phase_train=True, nclass=1001): + try: + from official.recommendation import neumf_model # pylint: disable=g-import-not-at-top + except ImportError as e: + if 'neumf_model' not in e.message: + raise + raise ImportError('To use the experimental NCF model, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models to the PYTHONPATH.') + del nclass + + users, items, _ = inputs + params = { + 'num_users': _NUM_USERS_20M, + 'num_items': _NUM_ITEMS_20M, + 'model_layers': (256, 256, 128, 64), + 'mf_dim': 64, + 'mf_regularization': 0, + 'mlp_reg_layers': (0, 0, 0, 0), + 'use_tpu': False + } + user_input = tf.keras.layers.Input(tensor=users, name='user_input') + item_input = tf.keras.layers.Input(tensor=items, name='item_input') + if self.data_type == tf.float32: + keras_model = neumf_model.construct_model(user_input, item_input, params) + logits = keras_model.output + else: + assert self.data_type == tf.float16 + old_floatx = tf.keras.backend.floatx() + try: + tf.keras.backend.set_floatx('float16') + # We cannot rely on the variable_scope's fp16 custom getter here, + # because the NCF model uses keras layers, which ignore variable scopes. + # So we use a variable_creator_scope instead. 
+ with tf.variable_creator_scope(_fp16_variable_creator): + keras_model = neumf_model.construct_model(user_input, item_input, + params) + logits = tf.cast(keras_model.output, tf.float32) + finally: + tf.keras.backend.set_floatx(old_floatx) + return model.BuildNetworkResult(logits=logits, extra_info=None) + + def loss_function(self, inputs, build_network_result): + logits = build_network_result.logits + + # Softmax with the first column of ones is equivalent to sigmoid. + # TODO(reedwm): Actually, the first column should be zeros to be equivalent + # to sigmoid. But, we keep it at ones to match the official models. + logits = tf.concat([tf.ones(logits.shape, dtype=logits.dtype), logits], + axis=1) + + return tf.losses.sparse_softmax_cross_entropy( + labels=inputs[2], + logits=logits + ) + + def get_synthetic_inputs(self, input_name, nclass): + """Returns the ops to generate synthetic inputs and labels.""" + def users_init_val(): + return tf.random_uniform((self.batch_size, 1), minval=0, + maxval=_NUM_USERS_20M, dtype=tf.int32) + users = tf.Variable(users_init_val, dtype=tf.int32, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name='synthetic_users') + def items_init_val(): + return tf.random_uniform((self.batch_size, 1), minval=0, + maxval=_NUM_ITEMS_20M, dtype=tf.int32) + items = tf.Variable(items_init_val, dtype=tf.int32, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name='synthetic_items') + + def labels_init_val(): + return tf.random_uniform((self.batch_size,), minval=0, maxval=2, + dtype=tf.int32) + labels = tf.Variable(labels_init_val, dtype=tf.int32, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name='synthetic_labels') + + return [users, items, labels] + + def get_input_shapes(self, subset): + del subset + return [[self.batch_size, 1], [self.batch_size, 1], [self.batch_size]] + + def get_input_data_types(self, subset): + del subset + return [self.int32, tf.int32, tf.int32] + + +def _fp16_variable_creator(next_creator, **kwargs): + """Variable creator to create variables in fp32 and cast them to fp16.""" + dtype = kwargs.get('dtype', None) + initial_value = kwargs.get('initial_value', None) + if dtype is None: + if initial_value is not None and not callable(initial_value): + dtype = initial_value.dtype + if dtype == tf.float16: + if callable(initial_value): + new_initial_value = lambda: tf.cast(initial_value(), tf.float32) + else: + new_initial_value = tf.cast(initial_value, tf.float32) + kwargs['dtype'] = tf.float32 + kwargs['initial_value'] = new_initial_value + var = next_creator(**kwargs) + return tf.cast(var, dtype=tf.float16) + else: + return next_creator(**kwargs) + diff --git a/cv/classification/resnet50/tensorflow/models/googlenet_model.py b/cv/classification/resnet50/tensorflow/models/googlenet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3505594ec933cc05cb96b00eeac81cbc4334693c --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/googlenet_model.py @@ -0,0 +1,63 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
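The NCF loss above turns a single logit column into a two-class softmax by prepending a column of ones. As the TODO notes, a column of zeros would reproduce the sigmoid exactly; with ones, the result is a sigmoid shifted by one. A quick numpy check of both claims (numpy is used purely for illustration here):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def two_class_softmax_p1(first_col, logit):
    """P(class 1) of a softmax over [first_col, logit]."""
    z = np.array([first_col, logit], dtype=np.float64)
    e = np.exp(z - z.max())
    return e[1] / e.sum()

x = 0.7
print(two_class_softmax_p1(0.0, x), sigmoid(x))        # zeros column == sigmoid(x)
print(two_class_softmax_p1(1.0, x), sigmoid(x - 1.0))  # ones column == sigmoid(x - 1)
```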
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Googlenet model configuration. + +References: + Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich + Going deeper with convolutions + arXiv preprint arXiv:1409.4842 (2014) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from models import model + + +class GooglenetModel(model.CNNModel): + """GoogLeNet.""" + + def __init__(self, params=None): + super(GooglenetModel, self).__init__( + 'googlenet', 224, 32, 0.005, params=params) + + def add_inference(self, cnn): + + def inception_v1(cnn, k, l, m, n, p, q): + cols = [[('conv', k, 1, 1)], [('conv', l, 1, 1), ('conv', m, 3, 3)], + [('conv', n, 1, 1), ('conv', p, 5, 5)], + [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)]] + cnn.inception_module('incept_v1', cols) + + cnn.conv(64, 7, 7, 2, 2) + cnn.mpool(3, 3, 2, 2, mode='SAME') + cnn.conv(64, 1, 1) + cnn.conv(192, 3, 3) + cnn.mpool(3, 3, 2, 2, mode='SAME') + inception_v1(cnn, 64, 96, 128, 16, 32, 32) + inception_v1(cnn, 128, 128, 192, 32, 96, 64) + cnn.mpool(3, 3, 2, 2, mode='SAME') + inception_v1(cnn, 192, 96, 208, 16, 48, 64) + inception_v1(cnn, 160, 112, 224, 24, 64, 64) + inception_v1(cnn, 128, 128, 256, 24, 64, 64) + inception_v1(cnn, 112, 144, 288, 32, 64, 64) + inception_v1(cnn, 256, 160, 320, 32, 128, 128) + cnn.mpool(3, 3, 2, 2, mode='SAME') + inception_v1(cnn, 256, 160, 320, 32, 128, 128) + inception_v1(cnn, 384, 192, 384, 48, 128, 128) + cnn.apool(7, 7, 1, 1, mode='VALID') + cnn.reshape([-1, 1024]) diff --git a/cv/classification/resnet50/tensorflow/models/inception_model.py b/cv/classification/resnet50/tensorflow/models/inception_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8835edb88cb57fde2b67bc8cb5fb2caffa0527f --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/inception_model.py @@ -0,0 +1,213 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Inception model configuration. + +Includes multiple models: inception3, inception4, inception-resnet2. 
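Each `inception_v1` module above concatenates four parallel branches, so its output width is `k + m + p + q` (the 1x1 branch, the 3x3 branch, the 5x5 branch, and the pooled 1x1 branch). A tiny helper confirms the 1024 channels the final `cnn.reshape([-1, 1024])` expects:

```python
# Output channels of the inception_v1 module defined above: the four branches
# end in convolutions of width k, m, p and q and are concatenated.
def inception_v1_channels(k, l, m, n, p, q):
    del l, n  # widths of the intermediate 1x1 reductions; they do not appear in the output
    return k + m + p + q

print(inception_v1_channels(64, 96, 128, 16, 32, 32))      # 256 (first module)
print(inception_v1_channels(384, 192, 384, 48, 128, 128))  # 1024 (last module, matches the reshape)
```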
+ +References: + Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi + Inception-v4, Inception-ResNet and the Impact of Residual Connections on + Learning + + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich + Going Deeper with Convolutions + http://arxiv.org/pdf/1409.4842v1.pdf + + Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, + Zbigniew Wojna + Rethinking the Inception Architecture for Computer Vision + arXiv preprint arXiv:1512.00567 (2015) + + Inception v3 model: http://arxiv.org/abs/1512.00567 + + Inception v4 and Resnet V2 architectures: http://arxiv.org/abs/1602.07261 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import xrange # pylint: disable=redefined-builtin +from models import model + + +class Inceptionv3Model(model.CNNModel): + """InceptionV3.""" + + def __init__(self, auxiliary=False, params=None): + self._auxiliary = auxiliary + super(Inceptionv3Model, self).__init__( + 'inception3', 299, 32, 0.005, params=params) + + def add_inference(self, cnn): + def inception_v3_a(cnn, n): + cols = [[('conv', 64, 1, 1)], [('conv', 48, 1, 1), ('conv', 64, 5, 5)], + [('conv', 64, 1, 1), ('conv', 96, 3, 3), ('conv', 96, 3, 3)], + [('apool', 3, 3, 1, 1, 'SAME'), ('conv', n, 1, 1)]] + cnn.inception_module('incept_v3_a', cols) + + def inception_v3_b(cnn): + cols = [[('conv', 384, 3, 3, 2, 2, 'VALID')], + [('conv', 64, 1, 1), + ('conv', 96, 3, 3), + ('conv', 96, 3, 3, 2, 2, 'VALID')], + [('mpool', 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v3_b', cols) + + def inception_v3_c(cnn, n): + cols = [[('conv', 192, 1, 1)], + [('conv', n, 1, 1), ('conv', n, 1, 7), ('conv', 192, 7, 1)], + [('conv', n, 1, 1), ('conv', n, 7, 1), ('conv', n, 1, 7), + ('conv', n, 7, 1), ('conv', 192, 1, 7)], + [('apool', 3, 3, 1, 1, 'SAME'), ('conv', 192, 1, 1)]] + cnn.inception_module('incept_v3_c', cols) + + def inception_v3_d(cnn): + cols = [[('conv', 192, 1, 1), ('conv', 320, 3, 3, 2, 2, 'VALID')], + [('conv', 192, 1, 1), ('conv', 192, 1, 7), ('conv', 192, 7, 1), + ('conv', 192, 3, 3, 2, 2, 'VALID')], + [('mpool', 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v3_d', cols) + + def inception_v3_e(cnn, pooltype): + cols = [[('conv', 320, 1, 1)], [('conv', 384, 1, 1), ('conv', 384, 1, 3)], + [('share',), ('conv', 384, 3, 1)], + [('conv', 448, 1, 1), ('conv', 384, 3, 3), ('conv', 384, 1, 3)], + [('share',), ('share',), ('conv', 384, 3, 1)], + [('mpool' if pooltype == 'max' else 'apool', 3, 3, 1, 1, 'SAME'), + ('conv', 192, 1, 1)]] + cnn.inception_module('incept_v3_e', cols) + + def incept_v3_aux(cnn): + assert cnn.aux_top_layer is None + cnn.aux_top_layer = cnn.top_layer + cnn.aux_top_size = cnn.top_size + with cnn.switch_to_aux_top_layer(): + cnn.apool(5, 5, 3, 3, mode='VALID') + cnn.conv(128, 1, 1, mode='SAME') + cnn.conv(768, 5, 5, mode='VALID', stddev=0.01) + cnn.reshape([-1, 768]) + + cnn.use_batch_norm = True + cnn.conv(32, 3, 3, 2, 2, mode='VALID') # 299 x 299 x 3 + cnn.conv(32, 3, 3, 1, 1, mode='VALID') # 149 x 149 x 32 + cnn.conv(64, 3, 3, 1, 1, mode='SAME') # 147 x 147 x 64 + cnn.mpool(3, 3, 2, 2, mode='VALID') # 147 x 147 x 64 + cnn.conv(80, 1, 1, 1, 1, mode='VALID') # 73 x 73 x 80 + cnn.conv(192, 3, 3, 1, 1, mode='VALID') # 71 x 71 x 192 + cnn.mpool(3, 3, 2, 2, 'VALID') # 35 x 35 x 192 + inception_v3_a(cnn, 32) # 35 x 35 x 256 mixed. 
+ inception_v3_a(cnn, 64) # 35 x 35 x 288 mixed_1. + inception_v3_a(cnn, 64) # 35 x 35 x 288 mixed_2 + inception_v3_b(cnn) # 17 x 17 x 768 mixed_3 + inception_v3_c(cnn, 128) # 17 x 17 x 768 mixed_4 + inception_v3_c(cnn, 160) # 17 x 17 x 768 mixed_5 + inception_v3_c(cnn, 160) # 17 x 17 x 768 mixed_6 + inception_v3_c(cnn, 192) # 17 x 17 x 768 mixed_7 + if self._auxiliary: + incept_v3_aux(cnn) # Auxillary Head logits + inception_v3_d(cnn) # 17 x 17 x 1280 mixed_8 + inception_v3_e(cnn, 'avg') # 8 x 8 x 2048 mixed_9 + inception_v3_e(cnn, 'max') # 8 x 8 x 2048 mixed_10 + cnn.apool(8, 8, 1, 1, 'VALID') # 8 x 8 x 2048 + cnn.reshape([-1, 2048]) # 1 x 1 x 2048 + + +# Stem functions +def inception_v4_sa(cnn): + cols = [[('mpool', 3, 3, 2, 2, 'VALID')], [('conv', 96, 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v4_sa', cols) + + +def inception_v4_sb(cnn): + cols = [[('conv', 64, 1, 1), ('conv', 96, 3, 3, 1, 1, 'VALID')], + [('conv', 64, 1, 1), ('conv', 64, 7, 1), ('conv', 64, 1, 7), + ('conv', 96, 3, 3, 1, 1, 'VALID')]] + cnn.inception_module('incept_v4_sb', cols) + + +def inception_v4_sc(cnn): + cols = [[('conv', 192, 3, 3, 2, 2, 'VALID')], + [('mpool', 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v4_sc', cols) + + +# Reduction functions +def inception_v4_ra(cnn, k, l, m, n): + cols = [ + [('mpool', 3, 3, 2, 2, 'VALID')], [('conv', n, 3, 3, 2, 2, 'VALID')], + [('conv', k, 1, 1), ('conv', l, 3, 3), ('conv', m, 3, 3, 2, 2, 'VALID')] + ] + cnn.inception_module('incept_v4_ra', cols) + + +def inception_v4_rb(cnn): + cols = [[('mpool', 3, 3, 2, 2, 'VALID')], + [('conv', 192, 1, 1), ('conv', 192, 3, 3, 2, 2, 'VALID')], + [('conv', 256, 1, 1), ('conv', 256, 1, 7), ('conv', 320, 7, 1), + ('conv', 320, 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v4_rb', cols) + + +class Inceptionv4Model(model.CNNModel): + """Inceptionv4.""" + + def __init__(self, params=None): + super(Inceptionv4Model, self).__init__( + 'inception4', 299, 32, 0.005, params=params) + + def add_inference(self, cnn): + def inception_v4_a(cnn): + cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 96, 1, 1)], + [('conv', 96, 1, 1)], [('conv', 64, 1, 1), ('conv', 96, 3, 3)], + [('conv', 64, 1, 1), ('conv', 96, 3, 3), ('conv', 96, 3, 3)]] + cnn.inception_module('incept_v4_a', cols) + + def inception_v4_b(cnn): + cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 128, 1, 1)], + [('conv', 384, 1, 1)], + [('conv', 192, 1, 1), ('conv', 224, 1, 7), ('conv', 256, 7, 1)], + [('conv', 192, 1, 1), ('conv', 192, 1, 7), ('conv', 224, 7, 1), + ('conv', 224, 1, 7), ('conv', 256, 7, 1)]] + cnn.inception_module('incept_v4_b', cols) + + def inception_v4_c(cnn): + cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 256, 1, 1)], + [('conv', 256, 1, 1)], [('conv', 384, 1, 1), ('conv', 256, 1, 3)], + [('share',), ('conv', 256, 3, 1)], + [('conv', 384, 1, 1), ('conv', 448, 1, 3), ('conv', 512, 3, 1), + ('conv', 256, 3, 1)], [('share',), ('share',), ('share',), + ('conv', 256, 1, 3)]] + cnn.inception_module('incept_v4_c', cols) + + cnn.use_batch_norm = True + cnn.conv(32, 3, 3, 2, 2, mode='VALID') + cnn.conv(32, 3, 3, 1, 1, mode='VALID') + cnn.conv(64, 3, 3) + inception_v4_sa(cnn) + inception_v4_sb(cnn) + inception_v4_sc(cnn) + for _ in xrange(4): + inception_v4_a(cnn) + inception_v4_ra(cnn, 192, 224, 256, 384) + for _ in xrange(7): + inception_v4_b(cnn) + inception_v4_rb(cnn) + for _ in xrange(3): + inception_v4_c(cnn) + cnn.spatial_mean() + cnn.dropout(0.8) diff --git a/cv/classification/resnet50/tensorflow/models/lenet_model.py 
b/cv/classification/resnet50/tensorflow/models/lenet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0218daaeb2b016b7bfcc886af813e92aee25f521 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/lenet_model.py @@ -0,0 +1,44 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Lenet model configuration. + +References: + LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner + Gradient-based learning applied to document recognition + Proceedings of the IEEE (1998) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from models import model + + +class Lenet5Model(model.CNNModel): + """Lenet5.""" + + def __init__(self, params=None): + super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005, params=params) + + def add_inference(self, cnn): + # Note: This matches TF's MNIST tutorial model + cnn.conv(32, 5, 5) + cnn.mpool(2, 2) + cnn.conv(64, 5, 5) + cnn.mpool(2, 2) + cnn.reshape([-1, 64 * 7 * 7]) + cnn.affine(512) diff --git a/cv/classification/resnet50/tensorflow/models/model.py b/cv/classification/resnet50/tensorflow/models/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3db13081917f9582704428c6c26956cbd652ae77 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/model.py @@ -0,0 +1,340 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base model configuration for CNN benchmarks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple + +import tensorflow.compat.v1 as tf + +import convnet_builder +import mlperf +from tensorflow.python.ops import variables as variables_module # pylint: disable=g-direct-tensorflow-import + +# BuildNetworkResult encapsulate the result (e.g. logits) of a +# Model.build_network() call. 
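The `cnn.reshape([-1, 64 * 7 * 7])` in the LeNet-5 definition above follows from two 2x2 max-pools applied to 28x28 MNIST inputs, assuming the builder's default 'SAME' convolutions (so only the pools change the spatial size): 28 -> 14 -> 7, with 64 channels after the second conv. A one-liner sanity check:

```python
# Flattened size after LeNet-5's conv/pool stack on 28x28 inputs, assuming
# 'SAME' convolutions so the spatial size only shrinks at the two pools.
size = 28
for _ in range(2):       # two 2x2 max-pools with stride 2
    size //= 2
print(64 * size * size)  # 3136 == 64 * 7 * 7, the reshape target above
```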
+BuildNetworkResult = namedtuple( + 'BuildNetworkResult', + [ + 'logits', # logits of the network + 'extra_info', # Model specific extra information + ]) + + +class Model(object): + """Base model config for DNN benchmarks.""" + + def __init__(self, + model_name, + batch_size, + learning_rate, + fp16_loss_scale, + params=None): + self.model_name = model_name + self.batch_size = batch_size + self.default_batch_size = batch_size + self.learning_rate = learning_rate + # TODO(reedwm) Set custom loss scales for each model instead of using the + # default of 128. + self.fp16_loss_scale = fp16_loss_scale + + # use_tf_layers specifies whether to build the model using tf.layers. + # fp16_vars specifies whether to create the variables in float16. + if params: + self.use_tf_layers = params.use_tf_layers + self.fp16_vars = params.fp16_vars + self.data_type = tf.float16 if params.use_fp16 else tf.float32 + else: + self.use_tf_layers = True + self.fp16_vars = False + self.data_type = tf.float32 + + def get_model_name(self): + return self.model_name + + def get_batch_size(self): + return self.batch_size + + def set_batch_size(self, batch_size): + self.batch_size = batch_size + + def get_default_batch_size(self): + return self.default_batch_size + + def get_fp16_loss_scale(self): + return self.fp16_loss_scale + + def filter_l2_loss_vars(self, variables): + """Filters out variables that the L2 loss should not be computed for. + + By default, this filters out batch normalization variables and keeps all + other variables. This behavior can be overridden by subclasses. + + Args: + variables: A list of the trainable variables. + + Returns: + A list of variables that the L2 loss should be computed for. + """ + mlperf.logger.log(key=mlperf.tags.MODEL_EXCLUDE_BN_FROM_L2, + value=True) + return [v for v in variables if 'batchnorm' not in v.name] + + def get_learning_rate(self, global_step, batch_size): + del global_step + del batch_size + return self.learning_rate + + def get_input_shapes(self, subset): + """Returns the list of expected shapes of all the inputs to this model.""" + del subset + raise NotImplementedError('Must be implemented in derived classes') + + def get_input_data_types(self, subset): + """Returns the list of data types of all the inputs to this model.""" + del subset + raise NotImplementedError('Must be implemented in derived classes') + + def get_synthetic_inputs(self, input_name, nclass): + """Returns the ops to generate synthetic inputs.""" + raise NotImplementedError('Must be implemented in derived classes') + + def build_network(self, inputs, phase_train, nclass): + """Builds the forward pass of the model. + + Args: + inputs: The list of inputs, including labels + phase_train: True during training. False during evaluation. + nclass: Number of classes that the inputs can belong to. + + Returns: + A BuildNetworkResult which contains the logits and model-specific extra + information. + """ + raise NotImplementedError('Must be implemented in derived classes') + + def loss_function(self, inputs, build_network_result): + """Returns the op to measure the loss of the model. + + Args: + inputs: the input list of the model. + build_network_result: a BuildNetworkResult returned by build_network(). + + Returns: + The loss tensor of the model. + """ + raise NotImplementedError('Must be implemented in derived classes') + + # TODO(laigd): have accuracy_function() take build_network_result instead. 
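`filter_l2_loss_vars` above keeps a variable in the weight-decay term only if its name does not contain 'batchnorm'. A tiny sketch of that name-based filter, with plain strings standing in for `tf.Variable` names (the names themselves are made up for illustration):

```python
# Name-based filtering as in filter_l2_loss_vars: batch-norm parameters are
# excluded from the L2 / weight-decay term.
trainable_names = [
    'cg/conv0/conv2d/kernel:0',
    'cg/conv0/batchnorm0/gamma:0',
    'cg/conv0/batchnorm0/beta:0',
    'cg/affine0/weights:0',
]
l2_names = [n for n in trainable_names if 'batchnorm' not in n]
print(l2_names)  # the two batch-norm variables are dropped
```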
+ def accuracy_function(self, inputs, logits): + """Returns the ops to measure the accuracy of the model.""" + raise NotImplementedError('Must be implemented in derived classes') + + def postprocess(self, results): + """Postprocess results returned from model in Python.""" + return results + + def reached_target(self): + """Define custom methods to stop training when model's target is reached.""" + return False + + +class CNNModel(Model): + """Base model configuration for CNN benchmarks.""" + + # TODO(laigd): reduce the number of parameters and read everything from + # params. + def __init__(self, + model, + image_size, + batch_size, + learning_rate, + layer_counts=None, + fp16_loss_scale=128, + params=None): + super(CNNModel, self).__init__( + model, batch_size, learning_rate, fp16_loss_scale, + params=params) + self.image_size = image_size + self.layer_counts = layer_counts + self.depth = 3 + self.params = params + self.data_format = params.data_format if params else 'NCHW' + + def get_layer_counts(self): + return self.layer_counts + + def skip_final_affine_layer(self): + """Returns if the caller of this class should skip the final affine layer. + + Normally, this class adds a final affine layer to the model after calling + self.add_inference(), to generate the logits. If a subclass override this + method to return True, the caller should not add the final affine layer. + + This is useful for tests. + """ + return False + + def add_backbone_saver(self): + """Creates a tf.train.Saver as self.backbone_saver for loading backbone. + + A tf.train.Saver must be created and saved in self.backbone_saver before + calling load_backbone_model, with correct variable name mapping to load + variables from checkpoint correctly into the current model. + """ + raise NotImplementedError(self.getName() + ' does not have backbone model.') + + def load_backbone_model(self, sess, backbone_model_path): + """Loads variable values from a pre-trained backbone model. + + This should be used at the beginning of the training process for transfer + learning models using checkpoints of base models. + + Args: + sess: session to train the model. + backbone_model_path: path to backbone model checkpoint file. + """ + del sess, backbone_model_path + raise NotImplementedError(self.getName() + ' does not have backbone model.') + + def add_inference(self, cnn): + """Adds the core layers of the CNN's forward pass. + + This should build the forward pass layers, except for the initial transpose + of the images and the final Dense layer producing the logits. The layers + should be build with the ConvNetBuilder `cnn`, so that when this function + returns, `cnn.top_layer` and `cnn.top_size` refer to the last layer and the + number of units of the layer layer, respectively. + + Args: + cnn: A ConvNetBuilder to build the forward pass layers with. + """ + del cnn + raise NotImplementedError('Must be implemented in derived classes') + + def get_input_data_types(self, subset): + """Return data types of inputs for the specified subset.""" + del subset # Same types for both 'train' and 'validation' subsets. + return [self.data_type, tf.int32] + + def get_input_shapes(self, subset): + """Return data shapes of inputs for the specified subset.""" + del subset # Same shapes for both 'train' and 'validation' subsets. 
+ # Each input is of shape [batch_size, height, width, depth] + # Each label is of shape [batch_size] + return [[self.batch_size, self.image_size, self.image_size, self.depth], + [self.batch_size]] + + def get_synthetic_inputs(self, input_name, nclass): + # Synthetic input should be within [0, 255]. + image_shape, label_shape = self.get_input_shapes('train') + inputs = tf.truncated_normal( + image_shape, + dtype=self.data_type, + mean=127, + stddev=60, + name=self.model_name + '_synthetic_inputs') + inputs = variables_module.VariableV1( + inputs, trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES], + name=input_name) + labels = tf.random_uniform( + label_shape, + minval=0, + maxval=nclass - 1, + dtype=tf.int32, + name=self.model_name + '_synthetic_labels') + return (inputs, labels) + + def gpu_preprocess_nhwc(self, images, phase_train=True): + del phase_train + return images + + def build_network(self, + inputs, + phase_train=True, + nclass=1001): + """Returns logits from input images. + + Args: + inputs: The input images and labels + phase_train: True during training. False during evaluation. + nclass: Number of classes that the images can belong to. + + Returns: + A BuildNetworkResult which contains the logits and model-specific extra + information. + """ + images = inputs[0] + images = self.gpu_preprocess_nhwc(images, phase_train) + if self.data_format == 'NCHW': + images = tf.transpose(images, [0, 3, 1, 2]) + var_type = tf.float32 + if self.data_type == tf.float16 and self.fp16_vars: + var_type = tf.float16 + network = convnet_builder.ConvNetBuilder( + images, self.depth, phase_train, self.use_tf_layers, self.data_format, + self.data_type, var_type) + with tf.variable_scope('cg', custom_getter=network.get_custom_getter()): + self.add_inference(network) + # Add the final fully-connected class layer + logits = ( + network.affine(nclass, activation='linear') + if not self.skip_final_affine_layer() else network.top_layer) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_FINAL_SHAPE, + value=logits.shape.as_list()[1:]) + aux_logits = None + if network.aux_top_layer is not None: + with network.switch_to_aux_top_layer(): + aux_logits = network.affine(nclass, activation='linear', stddev=0.001) + if self.data_type == tf.float16: + # TODO(reedwm): Determine if we should do this cast here. + logits = tf.cast(logits, tf.float32) + if aux_logits is not None: + aux_logits = tf.cast(aux_logits, tf.float32) + return BuildNetworkResult( + logits=logits, extra_info=None if aux_logits is None else aux_logits) + + def loss_function(self, inputs, build_network_result): + """Returns the op to measure the loss of the model.""" + logits = build_network_result.logits + _, labels = inputs + # TODO(laigd): consider putting the aux logit in the Inception model, + # which could call super.loss_function twice, once with the normal logits + # and once with the aux logits. 
+ aux_logits = build_network_result.extra_info + with tf.name_scope('xentropy'): + mlperf.logger.log(key=mlperf.tags.MODEL_HP_LOSS_FN, value=mlperf.tags.CCE) + cross_entropy = tf.losses.sparse_softmax_cross_entropy( + logits=logits, labels=labels) + loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') + if aux_logits is not None: + with tf.name_scope('aux_xentropy'): + aux_cross_entropy = tf.losses.sparse_softmax_cross_entropy( + logits=aux_logits, labels=labels) + aux_loss = 0.4 * tf.reduce_mean(aux_cross_entropy, name='aux_loss') + loss = tf.add_n([loss, aux_loss]) + return loss + + def accuracy_function(self, inputs, logits): + """Returns the ops to measure the accuracy of the model.""" + _, labels = inputs + top_1_op = tf.reduce_sum( + tf.cast(tf.nn.in_top_k(logits, labels, 1), self.data_type)) + top_5_op = tf.reduce_sum( + tf.cast(tf.nn.in_top_k(logits, labels, 5), self.data_type)) + return {'top_1_accuracy': top_1_op, 'top_5_accuracy': top_5_op} diff --git a/cv/classification/resnet50/tensorflow/models/model_config.py b/cv/classification/resnet50/tensorflow/models/model_config.py new file mode 100644 index 0000000000000000000000000000000000000000..1a31dc6233a71f7609668362a24360b74a6e2262 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/model_config.py @@ -0,0 +1,181 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Model configurations for CNN benchmarks. 
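The accuracy ops above count how many labels fall inside the top-1 and top-5 predictions via `tf.nn.in_top_k`. An equivalent numpy sketch, useful for eyeballing what those counts mean (numpy here is only for illustration, and ties are not handled exactly as `in_top_k` does):

```python
import numpy as np

def in_top_k_count(logits, labels, k):
    """Number of examples whose true label is among the k largest logits."""
    topk = np.argsort(logits, axis=1)[:, -k:]  # indices of the k largest logits per row
    return int(sum(label in row for label, row in zip(labels, topk)))

logits = np.array([[0.1, 2.0, 0.3],    # predicts class 1
                   [1.5, 0.2, 1.4]])   # predicts class 0
labels = np.array([1, 2])
print(in_top_k_count(logits, labels, 1))  # 1: only the first example is correct
print(in_top_k_count(logits, labels, 2))  # 2: class 2 is the runner-up for example 2
```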
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from functools import partial + +from models import alexnet_model +from models import densenet_model +from models import googlenet_model +from models import inception_model +from models import lenet_model +from models import official_resnet_model +from models import overfeat_model +from models import resnet_model +from models import trivial_model +from models import vgg_model +from models.experimental import deepspeech +from models.experimental import official_ncf_model + + +_model_name_to_imagenet_model = { + 'vgg11': vgg_model.Vgg11Model, + 'vgg16': vgg_model.Vgg16Model, + 'vgg19': vgg_model.Vgg19Model, + 'lenet': lenet_model.Lenet5Model, + 'googlenet': googlenet_model.GooglenetModel, + 'overfeat': overfeat_model.OverfeatModel, + 'alexnet': alexnet_model.AlexnetModel, + 'trivial': trivial_model.TrivialModel, + 'inception3': inception_model.Inceptionv3Model, + 'inception4': inception_model.Inceptionv4Model, + 'official_resnet18_v2': + partial(official_resnet_model.ImagenetResnetModel, 18), + 'official_resnet34_v2': + partial(official_resnet_model.ImagenetResnetModel, 34), + 'official_resnet50_v2': + partial(official_resnet_model.ImagenetResnetModel, 50), + 'official_resnet101_v2': + partial(official_resnet_model.ImagenetResnetModel, 101), + 'official_resnet152_v2': + partial(official_resnet_model.ImagenetResnetModel, 152), + 'official_resnet200_v2': + partial(official_resnet_model.ImagenetResnetModel, 200), + 'official_resnet18': + partial(official_resnet_model.ImagenetResnetModel, 18, version=1), + 'official_resnet34': + partial(official_resnet_model.ImagenetResnetModel, 34, version=1), + 'official_resnet50': + partial(official_resnet_model.ImagenetResnetModel, 50, version=1), + 'official_resnet101': + partial(official_resnet_model.ImagenetResnetModel, 101, version=1), + 'official_resnet152': + partial(official_resnet_model.ImagenetResnetModel, 152, version=1), + 'official_resnet200': + partial(official_resnet_model.ImagenetResnetModel, 200, version=1), + 'resnet50': resnet_model.create_resnet50_model, + 'resnet50_v1.5': resnet_model.create_resnet50_v1_5_model, + 'resnet50_v2': resnet_model.create_resnet50_v2_model, + 'resnet101': resnet_model.create_resnet101_model, + 'resnet101_v2': resnet_model.create_resnet101_v2_model, + 'resnet152': resnet_model.create_resnet152_model, + 'resnet152_v2': resnet_model.create_resnet152_v2_model, + 'ncf': official_ncf_model.NcfModel, +} + + +_model_name_to_cifar_model = { + 'alexnet': alexnet_model.AlexnetCifar10Model, + 'resnet20': resnet_model.create_resnet20_cifar_model, + 'resnet20_v2': resnet_model.create_resnet20_v2_cifar_model, + 'resnet32': resnet_model.create_resnet32_cifar_model, + 'resnet32_v2': resnet_model.create_resnet32_v2_cifar_model, + 'resnet44': resnet_model.create_resnet44_cifar_model, + 'resnet44_v2': resnet_model.create_resnet44_v2_cifar_model, + 'resnet56': resnet_model.create_resnet56_cifar_model, + 'resnet56_v2': resnet_model.create_resnet56_v2_cifar_model, + 'resnet110': resnet_model.create_resnet110_cifar_model, + 'resnet110_v2': resnet_model.create_resnet110_v2_cifar_model, + 'trivial': trivial_model.TrivialCifar10Model, + 'densenet40_k12': densenet_model.create_densenet40_k12_model, + 'densenet100_k12': densenet_model.create_densenet100_k12_model, + 'densenet100_k24': densenet_model.create_densenet100_k24_model, +} + + +_model_name_to_object_detection_model = { + 'trivial': 
trivial_model.TrivialSSD300Model, +} + + +def _get_model_map(dataset_name): + """Get name to model map for specified dataset.""" + if dataset_name == 'cifar10': + return _model_name_to_cifar_model + elif dataset_name in ('imagenet', 'synthetic', 'imagenette'): + return _model_name_to_imagenet_model + elif dataset_name == 'librispeech': + return {'deepspeech2': deepspeech.DeepSpeech2Model} + elif dataset_name == 'coco': + return _model_name_to_object_detection_model + else: + raise ValueError('Invalid dataset name: %s' % dataset_name) + + +# A model map dict can have this string as a value when TF2 is used, to indicate +# the model is only available in TF1. +_TF1_ONLY_STRING = 'TF1_ONLY' + + +def get_model_config(model_name, dataset, params): + """Map model name to model network configuration.""" + model_map = _get_model_map(dataset.name) + if model_name not in model_map: + raise ValueError('Invalid model name \'%s\' for dataset \'%s\'' % + (model_name, dataset.name)) + model = model_map[model_name](params=params) + if model == 'TF1_ONLY': + raise ValueError('Model \'%s\' can only be used with TensorFlow 1' + % (model_name,)) + return model + + +def register_model(model_name, dataset_name, model_func): + """Register a new model that can be obtained with `get_model_config`.""" + model_map = _get_model_map(dataset_name) + if model_name in model_map: + raise ValueError('Model "%s" is already registered for dataset "%s"' % + (model_name, dataset_name)) + model_map[model_name] = model_func + + +# pylint: disable=g-import-not-at-top +try: + from tensorflow.contrib import slim # pylint: disable=unused-import + can_import_contrib = True +except ImportError: + can_import_contrib = False + + +def register_tf1_models(): + """Registers all the TensorFlow 1-only models. + + TF 1-only models use contrib, which was removed in TF 2. If contrib can be + imported, the TF 1-only models are registered normally. If contrib cannot be + imported, the models are registered with the 'TF1_ONLY' string instead, which + will cause an error to be thrown if these models are used. + """ + if can_import_contrib: + from models.tf1_only import mobilenet_v2 + from models.tf1_only import nasnet_model + from models.tf1_only import ssd_model + register_model('mobilenet', 'imagenet', mobilenet_v2.MobilenetModel) + register_model('nasnet', 'imagenet', nasnet_model.NasnetModel) + register_model('nasnetlarge', 'imagenet', nasnet_model.NasnetLargeModel) + register_model('nasnet', 'cifar10', nasnet_model.NasnetCifarModel) + register_model('ssd300', 'coco', ssd_model.SSD300Model) + else: + register_model('mobilenet', 'imagenet', 'TF1_ONLY') + register_model('nasnet', 'imagenet', 'TF1_ONLY') + register_model('nasnetlarge', 'imagenet', 'TF1_ONLY') + register_model('nasnet', 'cifar10', 'TF1_ONLY') + register_model('ssd300', 'coco', 'TF1_ONLY') + diff --git a/cv/classification/resnet50/tensorflow/models/official_resnet_model.py b/cv/classification/resnet50/tensorflow/models/official_resnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a70943c644550fe1a092b20e2c9a9f63cd797623 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/official_resnet_model.py @@ -0,0 +1,77 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Import official resnet models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +import datasets +from models import model as model_lib + + +class ImagenetResnetModel(model_lib.CNNModel): + """Official resnet models.""" + + def __init__(self, resnet_size, version=2, params=None): + """These are the parameters that work for Imagenet data. + + Args: + resnet_size: The number of convolutional layers needed in the model. + version: 1 or 2 for v1 or v2, respectively. + params: params passed by BenchmarkCNN. + """ + default_batch_sizes = { + 50: 128, + 101: 32, + 152: 32 + } + batch_size = default_batch_sizes.get(resnet_size, 32) + default_learning_rate = 0.0125 * batch_size / 32 + model_name = 'official_resnet_{}_v{}'.format(resnet_size, version) + super(ImagenetResnetModel, self).__init__( + model_name, 224, batch_size, default_learning_rate, params=params) + self.resnet_size = resnet_size + self.version = version + + def get_learning_rate(self, global_step, batch_size): + num_batches_per_epoch = ( + float(datasets.IMAGENET_NUM_TRAIN_IMAGES) / batch_size) + boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]] + values = [1, 0.1, 0.01, 0.001, 0.0001] + adjusted_learning_rate = ( + self.learning_rate / self.default_batch_size * batch_size) + values = [v * adjusted_learning_rate for v in values] + return tf.train.piecewise_constant(global_step, boundaries, values) + + def build_network(self, images, phase_train=True, nclass=1001, + data_type=tf.float32): + # pylint: disable=g-import-not-at-top + try: + from official.resnet.r1.imagenet_main import ImagenetModel + except ImportError: + tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.') + raise + images = tf.cast(images, data_type) + model_class = ImagenetModel(resnet_size=self.resnet_size, + resnet_version=self.version, + # The official model dtype seems to be ignored, + # as the dtype it uses is the dtype of the input + # images. Doesn't hurt to set it though. + dtype=data_type) + logits = model_class(images, phase_train) + logits = tf.cast(logits, tf.float32) + return model_lib.BuildNetworkResult(logits=logits, extra_info=None) diff --git a/cv/classification/resnet50/tensorflow/models/overfeat_model.py b/cv/classification/resnet50/tensorflow/models/overfeat_model.py new file mode 100644 index 0000000000000000000000000000000000000000..7483bcbf3221f719e31baad4b9c93a4f52b0f629 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/overfeat_model.py @@ -0,0 +1,53 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Overfeat model configuration. + +References: + OverFeat: Integrated Recognition, Localization and Detection using + Convolutional Networks + Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus, + Yann LeCun, 2014 + http://arxiv.org/abs/1312.6229 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from models import model + + +class OverfeatModel(model.CNNModel): + """OverfeatModel.""" + + def __init__(self, params=None): + super(OverfeatModel, self).__init__( + 'overfeat', 231, 32, 0.005, params=params) + + def add_inference(self, cnn): + # Note: VALID requires padding the images by 3 in width and height + cnn.conv(96, 11, 11, 4, 4, mode='VALID') + cnn.mpool(2, 2) + cnn.conv(256, 5, 5, 1, 1, mode='VALID') + cnn.mpool(2, 2) + cnn.conv(512, 3, 3) + cnn.conv(1024, 3, 3) + cnn.conv(1024, 3, 3) + cnn.mpool(2, 2) + cnn.reshape([-1, 1024 * 6 * 6]) + cnn.affine(3072) + cnn.dropout() + cnn.affine(4096) + cnn.dropout() diff --git a/cv/classification/resnet50/tensorflow/models/resnet_model.py b/cv/classification/resnet50/tensorflow/models/resnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6340a30b89b661ea884df849e6c0949a2c7b9c86 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/resnet_model.py @@ -0,0 +1,489 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Resnet model configuration. + +References: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition + arXiv:1512.03385 (2015) + + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks + arXiv:1603.05027 (2016) + + Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, + Alan L. Yuille + DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, + Atrous Convolution, and Fully Connected CRFs + arXiv:1606.00915 (2016) +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf +import datasets +import mlperf +from models import model as model_lib + + +def bottleneck_block_v1(cnn, depth, depth_bottleneck, stride): + """Bottleneck block with identity short-cut for ResNet v1. 
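+
+  The block applies 1x1 (reduce), 3x3, and 1x1 (expand) convolutions, each
+  with batch norm, and adds the result to the shortcut before a final relu;
+  any stride is applied in the first 1x1 convolution.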
+ + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + """ + input_layer = cnn.top_layer + in_size = cnn.top_size + name_key = 'resnet_v1' + name = name_key + str(cnn.counts[name_key]) + cnn.counts[name_key] += 1 + + with tf.variable_scope(name): + if depth == in_size: + if stride == 1: + shortcut = input_layer + else: + shortcut = cnn.apool( + 1, 1, stride, stride, input_layer=input_layer, + num_channels_in=in_size) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + else: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, + use_batch_norm=True, input_layer=input_layer, + num_channels_in=in_size, bias=None) + cnn.conv(depth_bottleneck, 1, 1, stride, stride, + input_layer=input_layer, num_channels_in=in_size, + use_batch_norm=True, bias=None) + cnn.conv(depth_bottleneck, 3, 3, 1, 1, mode='SAME_RESNET', + use_batch_norm=True, bias=None) + res = cnn.conv(depth, 1, 1, 1, 1, activation=None, + use_batch_norm=True, bias=None) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_SHORTCUT_ADD) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + output = tf.nn.relu(shortcut + res) + cnn.top_layer = output + cnn.top_size = depth + + +def bottleneck_block_v1_5(cnn, depth, depth_bottleneck, stride): + """Bottleneck block with identity short-cut for ResNet v1.5. + + ResNet v1.5 is the informal name for ResNet v1 where stride 2 is used in the + first 3x3 convolution of each block instead of the first 1x1 convolution. + + First seen at https://github.com/facebook/fb.resnet.torch. Used in the paper + "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour" + (arXiv:1706.02677v2) and by fast.ai to train to accuracy in 45 epochs using + multiple image sizes. + + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + """ + input_layer = cnn.top_layer + in_size = cnn.top_size + name_key = 'resnet_v1.5' + name = name_key + str(cnn.counts[name_key]) + cnn.counts[name_key] += 1 + + with tf.variable_scope(name): + if depth == in_size: + if stride == 1: + shortcut = input_layer + else: + shortcut = cnn.apool( + 1, 1, stride, stride, input_layer=input_layer, + num_channels_in=in_size) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + else: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, + use_batch_norm=True, input_layer=input_layer, + num_channels_in=in_size, bias=None) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + cnn.conv(depth_bottleneck, 1, 1, 1, 1, + input_layer=input_layer, num_channels_in=in_size, + use_batch_norm=True, bias=None) + cnn.conv(depth_bottleneck, 3, 3, stride, stride, mode='SAME_RESNET', + use_batch_norm=True, bias=None) + res = cnn.conv(depth, 1, 1, 1, 1, activation=None, + use_batch_norm=True, bias=None) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_SHORTCUT_ADD) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + output = tf.nn.relu(shortcut + res) + cnn.top_layer = output + cnn.top_size = depth + + +def bottleneck_block_v2(cnn, depth, depth_bottleneck, stride): + """Bottleneck block with identity short-cut for ResNet v2. 
+ + The main difference from v1 is that a batch norm and relu are done at the + start of the block, instead of the end. This initial batch norm and relu is + collectively called a pre-activation. + + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + """ + input_layer = cnn.top_layer + in_size = cnn.top_size + name_key = 'resnet_v2' + name = name_key + str(cnn.counts[name_key]) + cnn.counts[name_key] += 1 + + preact = cnn.batch_norm() + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + preact = tf.nn.relu(preact) + with tf.variable_scope(name): + if depth == in_size: + if stride == 1: + shortcut = input_layer + else: + shortcut = cnn.apool( + 1, 1, stride, stride, input_layer=input_layer, + num_channels_in=in_size) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + else: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, use_batch_norm=False, + input_layer=preact, num_channels_in=in_size, bias=None) + cnn.conv(depth_bottleneck, 1, 1, stride, stride, + input_layer=preact, num_channels_in=in_size, + use_batch_norm=True, bias=None) + cnn.conv(depth_bottleneck, 3, 3, 1, 1, mode='SAME_RESNET', + use_batch_norm=True, bias=None) + res = cnn.conv(depth, 1, 1, 1, 1, activation=None, + use_batch_norm=False, bias=None) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_SHORTCUT_ADD) + output = shortcut + res + cnn.top_layer = output + cnn.top_size = depth + + +def bottleneck_block(cnn, depth, depth_bottleneck, stride, version): + """Bottleneck block with identity short-cut. + + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + version: version of ResNet to build. + """ + mlperf.logger.log(key=mlperf.tags.MODEL_HP_BLOCK_TYPE, + value=mlperf.tags.BOTTLENECK_BLOCK) + mlperf.logger.log_begin_block( + input_tensor=cnn.top_layer, block_type=mlperf.tags.BOTTLENECK_BLOCK) + if version == 'v2': + bottleneck_block_v2(cnn, depth, depth_bottleneck, stride) + elif version == 'v1.5': + bottleneck_block_v1_5(cnn, depth, depth_bottleneck, stride) + else: + bottleneck_block_v1(cnn, depth, depth_bottleneck, stride) + mlperf.logger.log_end_block(output_tensor=cnn.top_layer) + + +def residual_block(cnn, depth, stride, version, projection_shortcut=False): + """Residual block with identity short-cut. + + Args: + cnn: the network to append residual blocks. + depth: the number of output filters for this residual block. + stride: Stride used in the first layer of the residual block. + version: version of ResNet to build. + projection_shortcut: indicator of using projection shortcut, even if top + size and depth are equal + """ + pre_activation = True if version == 'v2' else False + input_layer = cnn.top_layer + in_size = cnn.top_size + + if projection_shortcut: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, + use_batch_norm=True, input_layer=input_layer, + num_channels_in=in_size, bias=None) + elif in_size != depth: + # Plan A of shortcut. 
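+    # Option A from the ResNet paper: downsample with average pooling and
+    # zero-pad the extra output channels, so the shortcut adds no parameters.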
+ shortcut = cnn.apool(1, 1, stride, stride, + input_layer=input_layer, + num_channels_in=in_size) + padding = (depth - in_size) // 2 + if cnn.channel_pos == 'channels_last': + shortcut = tf.pad( + shortcut, [[0, 0], [0, 0], [0, 0], [padding, padding]]) + else: + shortcut = tf.pad( + shortcut, [[0, 0], [padding, padding], [0, 0], [0, 0]]) + else: + shortcut = input_layer + if pre_activation: + res = cnn.batch_norm(input_layer) + res = tf.nn.relu(res) + else: + res = input_layer + cnn.conv(depth, 3, 3, stride, stride, + input_layer=res, num_channels_in=in_size, + use_batch_norm=True, bias=None) + if pre_activation: + res = cnn.conv(depth, 3, 3, 1, 1, activation=None, + use_batch_norm=False, bias=None) + output = shortcut + res + else: + res = cnn.conv(depth, 3, 3, 1, 1, activation=None, + use_batch_norm=True, bias=None) + output = tf.nn.relu(shortcut + res) + cnn.top_layer = output + cnn.top_size = depth + + +class ResnetModel(model_lib.CNNModel): + """Resnet cnn network configuration.""" + + def __init__(self, model, layer_counts, params=None): + default_batch_sizes = { + 'resnet50': 64, + 'resnet101': 32, + 'resnet152': 32, + 'resnet50_v1.5': 64, + 'resnet101_v1.5': 32, + 'resnet152_v1.5': 32, + 'resnet50_v2': 64, + 'resnet101_v2': 32, + 'resnet152_v2': 32, + } + batch_size = default_batch_sizes.get(model, 32) + # The ResNet paper uses a starting lr of .1 at bs=256. + self.base_lr_batch_size = 256 + base_lr = 0.128 + if params: + if params.resnet_base_lr: + base_lr = params.resnet_base_lr + if params.use_deep_stem: + self.use_deep_stem = True + else: + self.use_deep_stem = False + super(ResnetModel, self).__init__(model, 224, batch_size, base_lr, + layer_counts, params=params) + if 'v2' in model: + self.version = 'v2' + elif 'v1.5' in model: + self.version = 'v1.5' + else: + self.version = 'v1' + + def add_inference(self, cnn): + if self.layer_counts is None: + raise ValueError('Layer counts not specified for %s' % self.get_model()) + # Drop batch size from shape logging. 
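+    # cnn.top_layer.shape is [batch, <spatial/channel dims>]; [1:] keeps only
+    # the per-example dimensions.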
+ mlperf.logger.log(key=mlperf.tags.MODEL_HP_INITIAL_SHAPE, + value=cnn.top_layer.shape.as_list()[1:]) + cnn.use_batch_norm = True + cnn.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} + if self.use_deep_stem: + cnn.conv(32, 3, 3, 2, 2, mode='SAME_RESNET', use_batch_norm=True) + cnn.conv(32, 3, 3, 1, 1, mode='SAME_RESNET', use_batch_norm=True) + cnn.conv(64, 3, 3, 1, 1, mode='SAME_RESNET', use_batch_norm=True) + else: + cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True) + cnn.mpool(3, 3, 2, 2, mode='SAME') + for _ in xrange(self.layer_counts[0]): + bottleneck_block(cnn, 256, 64, 1, self.version) + for i in xrange(self.layer_counts[1]): + stride = 2 if i == 0 else 1 + bottleneck_block(cnn, 512, 128, stride, self.version) + for i in xrange(self.layer_counts[2]): + stride = 2 if i == 0 else 1 + bottleneck_block(cnn, 1024, 256, stride, self.version) + for i in xrange(self.layer_counts[3]): + stride = 2 if i == 0 else 1 + bottleneck_block(cnn, 2048, 512, stride, self.version) + if self.version == 'v2': + cnn.batch_norm() + cnn.top_layer = tf.nn.relu(cnn.top_layer) + cnn.spatial_mean() + + def get_learning_rate(self, global_step, batch_size): + rescaled_lr = self.get_scaled_base_learning_rate(batch_size) + num_batches_per_epoch = ( + datasets.IMAGENET_NUM_TRAIN_IMAGES / batch_size) + boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]] + values = [1, 0.1, 0.01, 0.001, 0.0001] + values = [rescaled_lr * v for v in values] + lr = tf.train.piecewise_constant(global_step, boundaries, values) + warmup_steps = int(num_batches_per_epoch * 5) + mlperf.logger.log(key=mlperf.tags.OPT_LR_WARMUP_STEPS, value=warmup_steps) + warmup_lr = ( + rescaled_lr * tf.cast(global_step, tf.float32) / tf.cast( + warmup_steps, tf.float32)) + return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr) + + def get_scaled_base_learning_rate(self, batch_size): + """Calculates base learning rate for creating lr schedule. + + In replicated mode, gradients are summed rather than averaged which, with + the sgd and momentum optimizers, increases the effective learning rate by + lr * num_gpus. Dividing the base lr by num_gpus negates the increase. + + Args: + batch_size: Total batch-size. + + Returns: + Base learning rate to use to create lr schedule. + """ + base_lr = self.learning_rate + if self.params.variable_update == 'replicated': + base_lr = self.learning_rate / self.params.num_gpus + scaled_lr = base_lr * (batch_size / self.base_lr_batch_size) + return scaled_lr + + +def create_resnet50_model(params): + return ResnetModel('resnet50', (3, 4, 6, 3), params=params) + + +def create_resnet50_v1_5_model(params): + return ResnetModel('resnet50_v1.5', (3, 4, 6, 3), params=params) + + +def create_resnet50_v2_model(params): + return ResnetModel('resnet50_v2', (3, 4, 6, 3), params=params) + + +def create_resnet101_model(params): + return ResnetModel('resnet101', (3, 4, 23, 3), params=params) + + +def create_resnet101_v2_model(params): + return ResnetModel('resnet101_v2', (3, 4, 23, 3), params=params) + + +def create_resnet152_model(params): + return ResnetModel('resnet152', (3, 8, 36, 3), params=params) + + +def create_resnet152_v2_model(params): + return ResnetModel('resnet152_v2', (3, 8, 36, 3), params=params) + + +class ResnetCifar10Model(model_lib.CNNModel): + """Resnet cnn network configuration for Cifar 10 dataset. + + V1 model architecture follows the one defined in the paper: + https://arxiv.org/pdf/1512.03385.pdf. 
+ + V2 model architecture follows the one defined in the paper: + https://arxiv.org/pdf/1603.05027.pdf. + """ + + def __init__(self, model, layer_counts, params=None): + if 'v2' in model: + self.version = 'v2' + else: + self.version = 'v1' + super(ResnetCifar10Model, self).__init__( + model, 32, 128, 0.1, layer_counts, params=params) + + def add_inference(self, cnn): + if self.layer_counts is None: + raise ValueError('Layer counts not specified for %s' % self.get_model()) + + cnn.use_batch_norm = True + cnn.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} + if self.version == 'v2': + cnn.conv(16, 3, 3, 1, 1, use_batch_norm=True) + else: + cnn.conv(16, 3, 3, 1, 1, activation=None, use_batch_norm=True) + for i in xrange(self.layer_counts[0]): + # reshape to batch_size x 16 x 32 x 32 + residual_block(cnn, 16, 1, self.version) + for i in xrange(self.layer_counts[1]): + # Subsampling is performed at the first convolution with a stride of 2 + stride = 2 if i == 0 else 1 + # reshape to batch_size x 32 x 16 x 16 + residual_block(cnn, 32, stride, self.version) + for i in xrange(self.layer_counts[2]): + stride = 2 if i == 0 else 1 + # reshape to batch_size x 64 x 8 x 8 + residual_block(cnn, 64, stride, self.version) + if self.version == 'v2': + cnn.batch_norm() + cnn.top_layer = tf.nn.relu(cnn.top_layer) + cnn.spatial_mean() + + def get_learning_rate(self, global_step, batch_size): + num_batches_per_epoch = int(50000 / batch_size) + boundaries = num_batches_per_epoch * np.array([82, 123, 300], + dtype=np.int64) + boundaries = [x for x in boundaries] + values = [0.1, 0.01, 0.001, 0.0002] + return tf.train.piecewise_constant(global_step, boundaries, values) + + +def create_resnet20_cifar_model(params): + return ResnetCifar10Model('resnet20', (3, 3, 3), params=params) + + +def create_resnet20_v2_cifar_model(params): + return ResnetCifar10Model('resnet20_v2', (3, 3, 3), params=params) + + +def create_resnet32_cifar_model(params): + return ResnetCifar10Model('resnet32', (5, 5, 5), params=params) + + +def create_resnet32_v2_cifar_model(params): + return ResnetCifar10Model('resnet32_v2', (5, 5, 5), params=params) + + +def create_resnet44_cifar_model(params): + return ResnetCifar10Model('resnet44', (7, 7, 7), params=params) + + +def create_resnet44_v2_cifar_model(params): + return ResnetCifar10Model('resnet44_v2', (7, 7, 7), params=params) + + +def create_resnet56_cifar_model(params): + return ResnetCifar10Model('resnet56', (9, 9, 9), params=params) + + +def create_resnet56_v2_cifar_model(params): + return ResnetCifar10Model('resnet56_v2', (9, 9, 9), params=params) + + +def create_resnet110_cifar_model(params): + return ResnetCifar10Model('resnet110', (18, 18, 18), params=params) + + +def create_resnet110_v2_cifar_model(params): + return ResnetCifar10Model('resnet110_v2', (18, 18, 18), params=params) diff --git a/cv/classification/resnet50/tensorflow/models/resnet_model_test.py b/cv/classification/resnet50/tensorflow/models/resnet_model_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b4052fcd2e996c7f02458b6754dfa6dd52635a94 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/resnet_model_test.py @@ -0,0 +1,80 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for resnet_model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import mock +import tensorflow.compat.v1 as tf + +from models import resnet_model + + +class ResNetModelTest(tf.test.TestCase): + + def testGetScaledBaseLearningRateOneGpuLrFromParams(self): + """Verifies setting params.resnet_base_lr pipes through.""" + lr = self._get_scaled_base_learning_rate(1, + 'parameter_server', + 256, + base_lr=.050) + self.assertEqual(lr, .050) + + def testGetScaledBaseLearningRateOneGpu(self): + lr = self._get_scaled_base_learning_rate(1, 'parameter_server', 128) + self.assertEqual(lr, .064) + + def testGetScaledBaseLearningRateEightGpuReplicated(self): + lr = self._get_scaled_base_learning_rate(8, 'replicated', 256 * 8) + self.assertEqual(lr, .128) + + def testGetScaledBaseLearningRateTwoGpuParameter(self): + lr = self._get_scaled_base_learning_rate(2, 'parameter_server', 256 * 2) + self.assertEqual(lr, .256) + + def testGetScaledBaseLearningRateTwoGpuUneven(self): + lr = self._get_scaled_base_learning_rate(2, 'replicated', 13) + self.assertEqual(lr, 0.0032500000000000003) + + def _get_scaled_base_learning_rate(self, + num_gpus, + variable_update, + batch_size, + base_lr=None): + """Simplifies testing different learning rate calculations. + + Args: + num_gpus: Number of GPUs to be used. + variable_update: Type of variable update used. + batch_size: Total batch size. + base_lr: Base learning rate before scaling. + + Returns: + Base learning rate that would be used to create lr schedule. + """ + params = mock.Mock() + params.num_gpus = num_gpus + params.variable_update = variable_update + if base_lr: + params.resnet_base_lr = base_lr + resnet50_model = resnet_model.ResnetModel('resnet50', 50, params=params) + return resnet50_model.get_scaled_base_learning_rate(batch_size) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/__init__.py b/cv/classification/resnet50/tensorflow/models/tf1_only/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..e1c2275a51635ae670e753fa8f9952f178fbef94 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet.py @@ -0,0 +1,467 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Mobilenet Base Class, branched from slim for fp16 performance study.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import collections +import contextlib +import copy +import os + +import tensorflow.compat.v1 as tf +from tensorflow.contrib import slim as contrib_slim + +slim = contrib_slim + + +@slim.add_arg_scope +def apply_activation(x, name=None, activation_fn=None): + return activation_fn(x, name=name) if activation_fn else x + + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@contextlib.contextmanager +def _set_arg_scope_defaults(defaults): + """Sets arg scope defaults for all items present in defaults. + + Args: + defaults: dictionary/list of pairs, containing a mapping from + function to a dictionary of default args. + + Yields: + context manager where all defaults are set. 
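+
+  Note: the defaults are applied recursively, each (func, kwargs) pair opening
+  its own slim.arg_scope nested inside the previous one.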
+ """ + if hasattr(defaults, 'items'): + items = list(defaults.items()) + else: + items = defaults + if not items: + yield + else: + func, default_arg = items[0] + with slim.arg_scope(func, **default_arg): + with _set_arg_scope_defaults(items[1:]): + yield + + +@slim.add_arg_scope +def depth_multiplier(output_params, + multiplier, + divisible_by=8, + min_depth=8, + **unused_kwargs): + if 'num_outputs' not in output_params: + return + d = output_params['num_outputs'] + output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by, + min_depth) + + +_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func']) + + +def op(opfunc, **params): + multiplier = params.pop('multiplier_transorm', depth_multiplier) + return _Op(opfunc, params=params, multiplier_func=multiplier) + + +class NoOpScope(object): + """No-op context manager.""" + + def __enter__(self): + return + + def __exit__(self, exc_type, exc_value, traceback): + return False + + +def safe_arg_scope(funcs, **kwargs): + """Returns `slim.arg_scope` with all None arguments removed. + + Arguments: + funcs: Functions to pass to `arg_scope`. + **kwargs: Arguments to pass to `arg_scope`. + + Returns: + arg_scope or No-op context manager. + + Note: can be useful if None value should be interpreted as "do not overwrite + this parameter value". + """ + filtered_args = {name: value for name, value in kwargs.items() + if value is not None} + if filtered_args: + return slim.arg_scope(funcs, **filtered_args) + else: + return NoOpScope() + + +@slim.add_arg_scope +def mobilenet_base( # pylint: disable=invalid-name + inputs, + conv_defs, + multiplier=1.0, + final_endpoint=None, + output_stride=None, + use_explicit_padding=False, + scope=None, + is_training=False): + """Mobilenet base network. + + Constructs a network from inputs to the given final endpoint. By default + the network is constructed in inference mode. To create network + in training mode use: + + with slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_base(...) + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + conv_defs: A list of op(...) layers specifying the net architecture. + multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + final_endpoint: The name of last layer, for early termination for + for V1-based networks: last layer is "layer_14", for V2: "layer_20" + output_stride: An integer that specifies the requested ratio of input to + output spatial resolution. If not None, then we invoke atrous convolution + if necessary to prevent the network from reducing the spatial resolution + of the activation maps. Allowed values are 1 or any even number, excluding + zero. Typical values are 8 (accurate fully convolutional mode), 16 + (fast fully convolutional mode), and 32 (classification mode). + + NOTE- output_stride relies on all consequent operators to support dilated + operators via "rate" parameter. This might require wrapping non-conv + operators to operate properly. + + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + scope: optional variable scope. + is_training: How to setup batch_norm and other ops. Note: most of the time + this does not need be set directly. 
Use mobilenet.training_scope() to set + up training instead. This parameter is here for backward compatibility + only. It is safe to set it to the value matching + training_scope(is_training=...). It is also safe to explicitly set + it to False, even if there is outer training_scope set to to training. + (The network will be built in inference mode). If this is set to None, + no arg_scope is added for slim.batch_norm's is_training parameter. + + Returns: + tensor_out: output tensor. + end_points: a set of activations for external use, for example summaries or + losses. + + Raises: + ValueError: depth_multiplier <= 0, or the target output_stride is not + allowed. + """ + if multiplier <= 0: + raise ValueError('multiplier is not greater than zero.') + + # Set conv defs defaults and overrides. + conv_defs_defaults = conv_defs.get('defaults', {}) + conv_defs_overrides = conv_defs.get('overrides', {}) + if use_explicit_padding: + conv_defs_overrides = copy.deepcopy(conv_defs_overrides) + conv_defs_overrides[ + (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'} + + if output_stride is not None: + if output_stride == 0 or (output_stride > 1 and output_stride % 2): + raise ValueError('Output stride must be None, 1 or a multiple of 2.') + + # a) Set the tensorflow scope + # b) set padding to default: note we might consider removing this + # since it is also set by mobilenet_scope + # c) set all defaults + # d) set all extra overrides. + with _scope_all(scope, default_scope='Mobilenet'), \ + safe_arg_scope([slim.batch_norm], is_training=is_training), \ + _set_arg_scope_defaults(conv_defs_defaults), \ + _set_arg_scope_defaults(conv_defs_overrides): + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + net = inputs + # Insert default parameters before the base scope which includes + # any custom overrides set in mobilenet. + end_points = {} + scopes = {} + for i, opdef in enumerate(conv_defs['spec']): + params = dict(opdef.params) + opdef.multiplier_func(params, multiplier) + stride = params.get('stride', 1) + if output_stride is not None and current_stride == output_stride: + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate *= stride + else: + layer_stride = stride + layer_rate = 1 + current_stride *= stride + # Update params. + params['stride'] = layer_stride + # Only insert rate to params if rate > 1. 
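+      # (a rate of 1 is an ordinary convolution, so no override is needed)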
+ if layer_rate > 1: + params['rate'] = layer_rate + # Set padding + if use_explicit_padding: + if 'kernel_size' in params: + net = _fixed_padding(net, params['kernel_size'], layer_rate) + else: + params['use_explicit_padding'] = True + + end_point = 'layer_%d' % (i + 1) + try: + net = opdef.op(net, **params) + except Exception: + print('Failed to create op %i: %r params: %r' % (i, opdef, params)) + raise + end_points[end_point] = net + scope = os.path.dirname(net.name) + scopes[scope] = end_point + if final_endpoint is not None and end_point == final_endpoint: + break + + # Add all tensors that end with 'output' to + # endpoints + for t in net.graph.get_operations(): + scope = os.path.dirname(t.name) + bn = os.path.basename(t.name) + if scope in scopes and t.name.endswith('output'): + end_points[scopes[scope] + '/' + bn] = t.outputs[0] + return net, end_points + + +@contextlib.contextmanager +def _scope_all(scope, default_scope=None): + with tf.variable_scope(scope, default_name=default_scope) as s,\ + tf.name_scope(s.original_name_scope): + yield s + + +@slim.add_arg_scope +def mobilenet(inputs, + num_classes=1001, + prediction_fn=slim.softmax, + reuse=None, + scope='Mobilenet', + base_only=False, + **mobilenet_args): + """Mobilenet model for classification, supports both V1 and V2. + + Note: default mode is inference, use mobilenet.training_scope to create + training network. + + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + num_classes: number of predicted classes. If 0 or None, the logits layer + is omitted and the input features to the logits layer (before dropout) + are returned instead. + prediction_fn: a function to get predictions out of logits + (default softmax). + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + base_only: if True will only create the base of the network (no pooling + and no logits). + **mobilenet_args: passed to mobilenet_base verbatim. + - conv_defs: list of conv defs + - multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + - output_stride: will ensure that the last layer has at most total stride. + If the architecture calls for more stride than that provided + (e.g. output_stride=16, but the architecture has 5 stride=2 operators), + it will replace output_stride with fractional convolutions using Atrous + Convolutions. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation tensor. + + Raises: + ValueError: Input rank is invalid. 
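+
+  Example (illustrative; V2_DEF is defined in models/tf1_only/mobilenet_v2.py):
+    images = tf.placeholder(tf.float32, [None, 224, 224, 3])
+    logits, end_points = mobilenet(images, conv_defs=mobilenet_v2.V2_DEF)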
+ """ + is_training = mobilenet_args.get('is_training', False) + input_shape = inputs.get_shape().as_list() + if len(input_shape) != 4: + raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) + + with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope: + inputs = tf.identity(inputs, 'input') + net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args) + if base_only: + return net, end_points + + net = tf.identity(net, name='embedding') + + with tf.variable_scope('Logits'): + net = global_pool(net) + end_points['global_pool'] = net + if not num_classes: + return net, end_points + net = slim.dropout(net, scope='Dropout', is_training=is_training) + # 1 x 1 x num_classes + # Note: legacy scope name. + logits = slim.conv2d( + net, + num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + biases_initializer=tf.zeros_initializer(), + scope='Conv2d_1c_1x1') + + logits = tf.squeeze(logits, [1, 2]) + + logits = tf.identity(logits, name='output') + end_points['Logits'] = logits + if prediction_fn: + end_points['Predictions'] = prediction_fn(logits, 'Predictions') + return logits, end_points + + +def global_pool(input_tensor, pool_op=tf.nn.avg_pool): + """Applies avg pool to produce 1x1 output. + + NOTE: This function is funcitonally equivalenet to reduce_mean, but it has + baked in average pool which has better support across hardware. + + Args: + input_tensor: input tensor + pool_op: pooling op (avg pool is default) + Returns: + a tensor batch_size x 1 x 1 x depth. + """ + shape = input_tensor.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size = tf.convert_to_tensor( + [1, tf.shape(input_tensor)[1], + tf.shape(input_tensor)[2], 1]) + else: + kernel_size = [1, shape[1], shape[2], 1] + output = pool_op( + input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') + # Recover output shape, for unknown shape. + output.set_shape([None, 1, 1, None]) + return output + + +def training_scope(is_training=True, + weight_decay=0.00004, + stddev=0.09, + dropout_keep_prob=0.8, + bn_decay=0.997): + """Defines Mobilenet training scope. + + Usage: + with tf.contrib.slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + # the network created will be trainble with dropout/batch norm + # initialized appropriately. + Args: + is_training: if set to False this will ensure that all customizations are + set to non-training mode. This might be helpful for code that is reused + across both training/evaluation, but most of the time training_scope with + value False is not needed. If this is set to None, the parameters is not + added to the batch_norm arg_scope. + + weight_decay: The weight decay to use for regularizing the model. + stddev: Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob: dropout keep probability (not set if equals to None). + bn_decay: decay for the batch norm moving averages (not set if equals to + None). + + Returns: + An argument scope to use via arg_scope. + """ + # Note: do not introduce parameters that would change the inference + # model here (for example whether to use bias), modify conv_def instead. + batch_norm_params = { + 'decay': bn_decay, + 'is_training': is_training + } + if stddev < 0: + weight_intitializer = slim.initializers.xavier_initializer() + else: + weight_intitializer = tf.truncated_normal_initializer(stddev=stddev) + + # Set weight_decay for weights in Conv and FC layers. 
+ with slim.arg_scope( + [slim.conv2d, slim.fully_connected, slim.separable_conv2d], + weights_initializer=weight_intitializer, + normalizer_fn=slim.batch_norm), \ + slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\ + safe_arg_scope([slim.batch_norm], **batch_norm_params), \ + safe_arg_scope([slim.dropout], is_training=is_training, + keep_prob=dropout_keep_prob), \ + slim.arg_scope([slim.conv2d], \ + weights_regularizer=slim.l2_regularizer(weight_decay)), \ + slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: + return s diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_conv_blocks.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_conv_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..34016b277b6cc90700984a44247fb971ce708277 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_conv_blocks.py @@ -0,0 +1,360 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Convolution blocks for mobilenet.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import functools + +import tensorflow.compat.v1 as tf +from tensorflow.contrib import slim + + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
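+  # For example, with divisor=64 a value of 72 would round down to 64 (an
+  # ~11% drop), so it is bumped up to 128 instead.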
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _split_divisible(num, num_ways, divisible_by=8): + """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" + assert num % divisible_by == 0 + assert num // num_ways >= divisible_by + # Note: want to round down, we adjust each split to match the total. + base = num // num_ways // divisible_by * divisible_by + result = [] + accumulated = 0 + for i in range(num_ways): + r = base + while accumulated + r < num * (i + 1) // num_ways: + r += divisible_by + result.append(r) + accumulated += r + assert accumulated == num + return result + + +@contextlib.contextmanager +def _v1_compatible_scope_naming(scope): # pylint: disable=g-missing-docstring + if scope is None: # Create uniqified separable blocks. + with tf.variable_scope(None, default_name='separable') as s, \ + tf.name_scope(s.original_name_scope): + yield '' + else: + # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. + # which provide numbered scopes. + scope += '_' + yield scope + + +@slim.add_arg_scope +def split_separable_conv2d(input_tensor, + num_outputs, + scope=None, + normalizer_fn=None, + stride=1, + rate=1, + endpoints=None, + use_explicit_padding=False): + """Separable mobilenet V1 style convolution. + + Depthwise convolution, with default non-linearity, + followed by 1x1 depthwise convolution. This is similar to + slim.separable_conv2d, but differs in tha it applies batch + normalization and non-linearity to depthwise. This matches + the basic building of Mobilenet Paper + (https://arxiv.org/abs/1704.04861) + + Args: + input_tensor: input + num_outputs: number of outputs + scope: optional name of the scope. Note if provided it will use + scope_depthwise for deptwhise, and scope_pointwise for pointwise. + normalizer_fn: which normalizer function to use for depthwise/pointwise + stride: stride + rate: output rate (also known as dilation rate) + endpoints: optional, if provided, will export additional tensors to it. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + + Returns: + output tesnor + """ + + with _v1_compatible_scope_naming(scope) as scope: + dw_scope = scope + 'depthwise' + endpoints = endpoints if endpoints is not None else {} + kernel_size = [3, 3] + padding = 'SAME' + if use_explicit_padding: + padding = 'VALID' + input_tensor = _fixed_padding(input_tensor, kernel_size, rate) + net = slim.separable_conv2d( + input_tensor, + None, + kernel_size, + depth_multiplier=1, + stride=stride, + rate=rate, + normalizer_fn=normalizer_fn, + padding=padding, + scope=dw_scope) + + endpoints[dw_scope] = net + + pw_scope = scope + 'pointwise' + net = slim.conv2d( + net, + num_outputs, [1, 1], + stride=1, + normalizer_fn=normalizer_fn, + scope=pw_scope) + endpoints[pw_scope] = net + return net + + +def expand_input_by_factor(n, divisible_by=8): + return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) + + +@slim.add_arg_scope +def expanded_conv(input_tensor, + num_outputs, + expansion_size=expand_input_by_factor(6), + stride=1, + rate=1, + kernel_size=(3, 3), + residual=True, + normalizer_fn=None, + split_projection=1, + split_expansion=1, + expansion_transform=None, + depthwise_location='expansion', + depthwise_channel_multiplier=1, + endpoints=None, + use_explicit_padding=False, + padding='SAME', + scope=None): + """Depthwise Convolution Block with expansion. 
+ + Builds a composite convolution that has the following structure + expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) + + Args: + input_tensor: input + num_outputs: number of outputs in the final layer. + expansion_size: the size of expansion, could be a constant or a callable. + If latter it will be provided 'num_inputs' as an input. For forward + compatibility it should accept arbitrary keyword arguments. + Default will expand the input by factor of 6. + stride: depthwise stride + rate: depthwise rate + kernel_size: depthwise kernel + residual: whether to include residual connection between input + and output. + normalizer_fn: batchnorm or otherwise + split_projection: how many ways to split projection operator + (that is conv expansion->bottleneck) + split_expansion: how many ways to split expansion op + (that is conv bottleneck->expansion) ops will keep depth divisible + by this value. + expansion_transform: Optional function that takes expansion + as a single input and returns output. + depthwise_location: where to put depthwise covnvolutions supported + values None, 'input', 'output', 'expansion' + depthwise_channel_multiplier: depthwise channel multiplier: + each input will replicated (with different filters) + that many times. So if input had c channels, + output will have c x depthwise_channel_multpilier. + endpoints: An optional dictionary into which intermediate endpoints are + placed. The keys "expansion_output", "depthwise_output", + "projection_output" and "expansion_transform" are always populated, even + if the corresponding functions are not invoked. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + padding: Padding type to use if `use_explicit_padding` is not set. + scope: optional scope. 
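+
+  With the default expansion_size=expand_input_by_factor(6), an input with,
+  e.g., 24 channels is expanded to 144 channels before the depthwise step.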
+ + Returns: + Tensor of depth num_outputs + + Raises: + TypeError: on inval + """ + with tf.variable_scope(scope, default_name='expanded_conv') as s, \ + tf.name_scope(s.original_name_scope): + prev_depth = input_tensor.get_shape().as_list()[3] + if depthwise_location not in [None, 'input', 'output', 'expansion']: + raise TypeError('%r is unknown value for depthwise_location' % + depthwise_location) + if use_explicit_padding: + if padding != 'SAME': + raise TypeError('`use_explicit_padding` should only be used with ' + '"SAME" padding.') + padding = 'VALID' + depthwise_func = functools.partial( + slim.separable_conv2d, + num_outputs=None, + kernel_size=kernel_size, + depth_multiplier=depthwise_channel_multiplier, + stride=stride, + rate=rate, + normalizer_fn=normalizer_fn, + padding=padding, + scope='depthwise') + # b1 -> b2 * r -> b2 + # i -> (o * r) (bottleneck) -> o + input_tensor = tf.identity(input_tensor, 'input') + net = input_tensor + + if depthwise_location == 'input': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net, activation_fn=None) + + if callable(expansion_size): + inner_size = expansion_size(num_inputs=prev_depth) + else: + inner_size = expansion_size + + if inner_size > net.shape[3]: + net = split_conv( + net, + inner_size, + num_ways=split_expansion, + scope='expand', + stride=1, + normalizer_fn=normalizer_fn) + net = tf.identity(net, 'expansion_output') + if endpoints is not None: + endpoints['expansion_output'] = net + + if depthwise_location == 'expansion': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net) + + net = tf.identity(net, name='depthwise_output') + if endpoints is not None: + endpoints['depthwise_output'] = net + if expansion_transform: + net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) + # Note in contrast with expansion, we always have + # projection to produce the desired output size. + net = split_conv( + net, + num_outputs, + num_ways=split_projection, + stride=1, + scope='project', + normalizer_fn=normalizer_fn, + activation_fn=tf.identity) + if endpoints is not None: + endpoints['projection_output'] = net + if depthwise_location == 'output': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net, activation_fn=None) + + if callable(residual): # custom residual + net = residual(input_tensor=input_tensor, output_tensor=net) + elif (residual and + # stride check enforces that we don't add residuals when spatial + # dimensions are None + stride == 1 and + # Depth matches + net.get_shape().as_list()[3] == + input_tensor.get_shape().as_list()[3]): + net += input_tensor + return tf.identity(net, name='output') + + +def split_conv(input_tensor, + num_outputs, + num_ways, + scope, + divisible_by=8, + **kwargs): + """Creates a split convolution. + + Split convolution splits the input and output into + 'num_blocks' blocks of approximately the same size each, + and only connects $i$-th input to $i$ output. + + Args: + input_tensor: input tensor + num_outputs: number of output filters + num_ways: num blocks to split by. + scope: scope for all the operators. + divisible_by: make sure that every part is divisiable by this. 
+ **kwargs: will be passed directly into conv2d operator + Returns: + tensor + """ + b = input_tensor.get_shape().as_list()[3] + + if num_ways == 1 or min(b // num_ways, + num_outputs // num_ways) < divisible_by: + # Don't do any splitting if we end up with less than 8 filters + # on either side. + return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) + + outs = [] + input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) + output_splits = _split_divisible( + num_outputs, num_ways, divisible_by=divisible_by) + inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) + base = scope + for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): + scope = base + '_part_%d' % (i,) + n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) + n = tf.identity(n, scope + '_output') + outs.append(n) + return tf.concat(outs, 3, name=scope + '_concat') diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_test.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c0b7d5345077585b99a6a6b5e305388bfcc5eaf0 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_test.py @@ -0,0 +1,191 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for mobilenet_v2, branched from slim for fp16 performance study.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import tensorflow.compat.v1 as tf + +from models.tf1_only import mobilenet +from models.tf1_only import mobilenet_conv_blocks as ops +from models.tf1_only import mobilenet_v2 +from tensorflow.contrib import slim + + +def find_ops(optype): + """Find ops of a given type in graphdef or a graph. + + Args: + optype: operation type (e.g. Conv2D) + Returns: + List of operations. + """ + gd = tf.get_default_graph() + return [var for var in gd.get_operations() if var.type == optype] + + +class MobilenetV2Test(tf.test.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + tf.reset_default_graph() + + def testCreation(self): + spec = dict(mobilenet_v2.V2_DEF) + _, ep = mobilenet.mobilenet( + tf.placeholder(tf.float32, (10, 224, 224, 16)), conv_defs=spec) + num_convs = len(find_ops('Conv2D')) + + # This is mostly a sanity test. No deep reason for these particular + # constants. + # + # All but first 2 and last one have two convolutions, and there is one + # extra conv that is not in the spec. (logits) + self.assertEqual(num_convs, len(spec['spec']) * 2 - 2) + # Check that depthwise are exposed. 
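+    # (Only the expanded_conv layers register a 'layer_%d/depthwise_output'
+    # endpoint; the plain conv layers do not.)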
+    for i in range(2, 17):
+      self.assertIn('layer_%d/depthwise_output' % i, ep)
+
+  def testCreationNoClasses(self):
+    spec = copy.deepcopy(mobilenet_v2.V2_DEF)
+    net, ep = mobilenet.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)), conv_defs=spec,
+        num_classes=None)
+    self.assertIs(net, ep['global_pool'])
+
+  def testImageSizes(self):
+    for input_size, output_size in [(224, 7), (192, 6), (160, 5),
+                                    (128, 4), (96, 3)]:
+      tf.reset_default_graph()
+      _, ep = mobilenet_v2.mobilenet(
+          tf.placeholder(tf.float32, (10, input_size, input_size, 3)))
+
+      self.assertEqual(ep['layer_18/output'].get_shape().as_list()[1:3],
+                       [output_size] * 2)
+
+  def testWithSplits(self):
+    spec = copy.deepcopy(mobilenet_v2.V2_DEF)
+    spec['overrides'] = {
+        (ops.expanded_conv,): dict(split_expansion=2),
+    }
+    _, _ = mobilenet.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)), conv_defs=spec)
+    num_convs = len(find_ops('Conv2D'))
+    # All but 3 ops have 3 conv operators each, the remaining 3 have one,
+    # and there is one extra conv (the logits) that is not in the spec.
+    self.assertEqual(num_convs, len(spec['spec']) * 3 - 5)
+
+  def testWithOutputStride8(self):
+    out, _ = mobilenet.mobilenet_base(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)),
+        conv_defs=mobilenet_v2.V2_DEF,
+        output_stride=8,
+        scope='MobilenetV2')
+    self.assertEqual(out.get_shape().as_list()[1:3], [28, 28])
+
+  def testDivisibleBy(self):
+    tf.reset_default_graph()
+    mobilenet_v2.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)),
+        conv_defs=mobilenet_v2.V2_DEF,
+        divisible_by=16,
+        min_depth=32)
+    s = [op.outputs[0].get_shape().as_list()[-1] for op in find_ops('Conv2D')]
+    s = set(s)
+    self.assertSameElements([32, 64, 96, 160, 192, 320, 384, 576, 960, 1280,
+                             1001], s)
+
+  def testDivisibleByWithArgScope(self):
+    tf.reset_default_graph()
+    # Verifies that the depth_multiplier arg scope actually works
+    # if no default min_depth is provided.
+    with slim.arg_scope((mobilenet.depth_multiplier,), min_depth=32):
+      mobilenet_v2.mobilenet(
+          tf.placeholder(tf.float32, (10, 224, 224, 2)),
+          conv_defs=mobilenet_v2.V2_DEF, depth_multiplier=0.1)
+      s = [op.outputs[0].get_shape().as_list()[-1] for op in find_ops('Conv2D')]
+      s = set(s)
+      self.assertSameElements(s, [32, 192, 128, 1001])
+
+  def testFineGrained(self):
+    tf.reset_default_graph()
+    # Verifies that finegrain_classification_mode keeps the last layer wide
+    # even for a very small depth_multiplier.
+    mobilenet_v2.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 2)),
+        conv_defs=mobilenet_v2.V2_DEF, depth_multiplier=0.01,
+        finegrain_classification_mode=True)
+    s = [op.outputs[0].get_shape().as_list()[-1] for op in find_ops('Conv2D')]
+    s = set(s)
+    # All convolutions will be 8->48, except for the last one.
+    self.assertSameElements(s, [8, 48, 1001, 1280])
+
+  def testMobilenetBase(self):
+    tf.reset_default_graph()
+    # Verifies that mobilenet_base returns the pre-pooling layer.
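+    # With depth_multiplier=0.1 the final 1280-channel layer scales to 128
+    # (min_depth=32 only affects the thinner layers), which the shape
+    # assertion below checks.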
+ with slim.arg_scope((mobilenet.depth_multiplier,), min_depth=32): + net, _ = mobilenet_v2.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, depth_multiplier=0.1) + self.assertEqual(net.get_shape().as_list(), [10, 7, 7, 128]) + + def testWithOutputStride16(self): + tf.reset_default_graph() + out, _ = mobilenet.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, + output_stride=16) + self.assertEqual(out.get_shape().as_list()[1:3], [14, 14]) + + def testWithOutputStride8AndExplicitPadding(self): + tf.reset_default_graph() + out, _ = mobilenet.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, + output_stride=8, + use_explicit_padding=True, + scope='MobilenetV2') + self.assertEqual(out.get_shape().as_list()[1:3], [28, 28]) + + def testWithOutputStride16AndExplicitPadding(self): + tf.reset_default_graph() + out, _ = mobilenet.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, + output_stride=16, + use_explicit_padding=True) + self.assertEqual(out.get_shape().as_list()[1:3], [14, 14]) + + def testBatchNormScopeDoesNotHaveIsTrainingWhenItsSetToNone(self): + sc = mobilenet.training_scope(is_training=None) + self.assertNotIn('is_training', sc[slim.arg_scope_func_key( + slim.batch_norm)]) + + def testBatchNormScopeDoesHasIsTrainingWhenItsNotNone(self): + sc = mobilenet.training_scope(is_training=False) + self.assertIn('is_training', sc[slim.arg_scope_func_key(slim.batch_norm)]) + sc = mobilenet.training_scope(is_training=True) + self.assertIn('is_training', sc[slim.arg_scope_func_key(slim.batch_norm)]) + sc = mobilenet.training_scope() + self.assertIn('is_training', sc[slim.arg_scope_func_key(slim.batch_norm)]) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_v2.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ac811470719c6a3f867fd88484aaa862bce09e76 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_v2.py @@ -0,0 +1,198 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Mobilenet V2 model, branched from slim models for fp16 performance study. + +Architecture: https://arxiv.org/abs/1801.04381 + +The base model gives 72.2% accuracy on ImageNet, with 300MMadds, +3.4 M parameters. 
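+
+Typical usage (mirroring the pattern documented on `mobilenet` below):
+
+  with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()):
+    logits, endpoints = mobilenet_v2.mobilenet(input_tensor)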
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import tensorflow.compat.v1 as tf + +from models import model +from models.tf1_only import mobilenet as lib +from models.tf1_only import mobilenet_conv_blocks as ops +from tensorflow.contrib import slim + +op = lib.op + +expand_input = ops.expand_input_by_factor + +# pyformat: disable +# Architecture: https://arxiv.org/abs/1801.04381 +V2_DEF = dict( + defaults={ + # Note: these parameters of batch norm affect the architecture + # that's why they are here and not in training_scope. + (slim.batch_norm,): {'center': True, 'scale': True}, + (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { + 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 + }, + (ops.expanded_conv,): { + 'expansion_size': expand_input(6), + 'split_expansion': 1, + 'normalizer_fn': slim.batch_norm, + 'residual': True + }, + (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} + }, + spec=[ + op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), + op(ops.expanded_conv, + expansion_size=expand_input(1, divisible_by=1), + num_outputs=16), + op(ops.expanded_conv, stride=2, num_outputs=24), + op(ops.expanded_conv, stride=1, num_outputs=24), + op(ops.expanded_conv, stride=2, num_outputs=32), + op(ops.expanded_conv, stride=1, num_outputs=32), + op(ops.expanded_conv, stride=1, num_outputs=32), + op(ops.expanded_conv, stride=2, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=2, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=320), + op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) + ], +) +# pyformat: enable + + +@slim.add_arg_scope +def mobilenet(input_tensor, + num_classes=1001, + depth_multiplier=1.0, + scope='MobilenetV2', + conv_defs=None, + finegrain_classification_mode=False, + min_depth=None, + divisible_by=None, + **kwargs): + """Creates mobilenet V2 network. + + Inference mode is created by default. To create training use training_scope + below. + + with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + Args: + input_tensor: The input tensor + num_classes: number of classes + depth_multiplier: The multiplier applied to scale number of + channels in each layer. Note: this is called depth multiplier in the + paper but the name is kept for consistency with slim's model builder. + scope: Scope of the operator + conv_defs: Allows to override default conv def. + finegrain_classification_mode: When set to True, the model + will keep the last layer large even for small multipliers. Following + https://arxiv.org/abs/1801.04381 + suggests that it improves performance for ImageNet-type of problems. + *Note* ignored if final_endpoint makes the builder exit earlier. + min_depth: If provided, will ensure that all layers will have that + many channels after application of depth multiplier. + divisible_by: If provided will ensure that all layers # channels + will be divisible by this number. 
+ **kwargs: passed directly to mobilenet.mobilenet: + prediction_fn- what prediction function to use. + reuse-: whether to reuse variables (if reuse set to true, scope + must be given). + Returns: + logits/endpoints pair + + Raises: + ValueError: On invalid arguments + """ + if conv_defs is None: + conv_defs = V2_DEF + if 'multiplier' in kwargs: + raise ValueError('mobilenetv2 doesn\'t support generic ' + 'multiplier parameter use "depth_multiplier" instead.') + if finegrain_classification_mode: + conv_defs = copy.deepcopy(conv_defs) + if depth_multiplier < 1: + conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier + + depth_args = {} + # NB: do not set depth_args unless they are provided to avoid overriding + # whatever default depth_multiplier might have thanks to arg_scope. + if min_depth is not None: + depth_args['min_depth'] = min_depth + if divisible_by is not None: + depth_args['divisible_by'] = divisible_by + + with slim.arg_scope((lib.depth_multiplier,), **depth_args): + return lib.mobilenet( + input_tensor, + num_classes=num_classes, + conv_defs=conv_defs, + scope=scope, + multiplier=depth_multiplier, + **kwargs) + + +@slim.add_arg_scope +def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): + """Creates base of the mobilenet (no pooling and no logits) .""" + return mobilenet( + input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs) + + +def training_scope(**kwargs): + """Defines MobilenetV2 training scope. + + Usage: + with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + with slim. + + Args: + **kwargs: Passed to mobilenet.training_scope. The following parameters + are supported: + weight_decay- The weight decay to use for regularizing the model. + stddev- Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob- dropout keep probability + bn_decay- decay for the batch norm moving averages. + + Returns: + An `arg_scope` to use for the mobilenet v2 model. + """ + return lib.training_scope(**kwargs) + + +class MobilenetModel(model.CNNModel): + """Mobilenet model configuration.""" + + def __init__(self, params=None): + super(MobilenetModel, self).__init__( + 'mobilenet', 224, 32, 0.005, params=params) + + def add_inference(self, cnn): + with slim.arg_scope(training_scope(is_training=cnn.phase_train)): + cnn.top_layer, _ = mobilenet(cnn.top_layer, is_training=cnn.phase_train) + cnn.top_size = cnn.top_layer.shape[-1].value diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_model.py b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..560d86bcaf88589734696748379150a6615a58fc --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_model.py @@ -0,0 +1,582 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Model configurations for nasnet. + +Paper: https://arxiv.org/abs/1707.07012 +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +from models import model +from models.tf1_only import nasnet_utils +from tensorflow.contrib import framework as contrib_framework +from tensorflow.contrib import layers as contrib_layers +from tensorflow.contrib import slim +from tensorflow.contrib import training as contrib_training + +arg_scope = contrib_framework.arg_scope + + +# Notes for training NASNet Cifar Model +# ------------------------------------- +# batch_size: 32 +# learning rate: 0.025 +# cosine (single period) learning rate decay +# auxiliary head loss weighting: 0.4 +# clip global norm of all gradients by 5 +def _cifar_config(is_training=True, data_format=None, total_steps=None): + drop_path_keep_prob = 1.0 if not is_training else 0.6 + return contrib_training.HParams( + stem_multiplier=3.0, + drop_path_keep_prob=drop_path_keep_prob, + num_cells=18, + use_aux_head=1, + num_conv_filters=32, + dense_dropout_keep_prob=1.0, + filter_scaling_rate=2.0, + num_reduction_layers=2, + skip_reduction_layer_input=0, + data_format=data_format or 'NHWC', + # 600 epochs with a batch size of 32 + # This is used for the drop path probabilities since it needs to increase + # the drop out probability over the course of training. + total_training_steps=total_steps or 937500, + ) + + +# Notes for training large NASNet model on ImageNet +# ------------------------------------- +# batch size (per replica): 16 +# learning rate: 0.015 * 100 +# learning rate decay factor: 0.97 +# num epochs per decay: 2.4 +# sync sgd with 100 replicas +# auxiliary head loss weighting: 0.4 +# label smoothing: 0.1 +# clip global norm of all gradients by 10 +def _large_imagenet_config(is_training=True, data_format=None, + total_steps=None): + drop_path_keep_prob = 1.0 if not is_training else 0.7 + return contrib_training.HParams( + stem_multiplier=3.0, + dense_dropout_keep_prob=0.5, + num_cells=18, + filter_scaling_rate=2.0, + num_conv_filters=168, + drop_path_keep_prob=drop_path_keep_prob, + use_aux_head=1, + num_reduction_layers=2, + skip_reduction_layer_input=1, + data_format=data_format or 'NHWC', + total_training_steps=total_steps or 250000, + ) + + +# Notes for training the mobile NASNet ImageNet model +# ------------------------------------- +# batch size (per replica): 32 +# learning rate: 0.04 * 50 +# learning rate scaling factor: 0.97 +# num epochs per decay: 2.4 +# sync sgd with 50 replicas +# auxiliary head weighting: 0.4 +# label smoothing: 0.1 +# clip global norm of all gradients by 10 +def _mobile_imagenet_config(data_format=None, total_steps=None): + return contrib_training.HParams( + stem_multiplier=1.0, + dense_dropout_keep_prob=0.5, + num_cells=12, + filter_scaling_rate=2.0, + drop_path_keep_prob=1.0, + num_conv_filters=44, + use_aux_head=1, + num_reduction_layers=2, + skip_reduction_layer_input=0, + data_format=data_format or 'NHWC', + total_training_steps=total_steps or 250000, + ) + + +def nasnet_cifar_arg_scope(weight_decay=5e-4, + batch_norm_decay=0.9, + batch_norm_epsilon=1e-5): + """Defines the default arg scope for the NASNet-A Cifar model. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: Decay for batch norm moving average. 
+ batch_norm_epsilon: Small float added to variance to avoid dividing by zero + in batch norm. + Returns: + An `arg_scope` to use for the NASNet Cifar Model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': batch_norm_decay, + # epsilon to prevent 0s in variance. + 'epsilon': batch_norm_epsilon, + 'scale': True, + 'fused': True, + } + weights_regularizer = contrib_layers.l2_regularizer(weight_decay) + weights_initializer = contrib_layers.variance_scaling_initializer( + mode='FAN_OUT') + with arg_scope( + [slim.fully_connected, slim.conv2d, slim.separable_conv2d], + weights_regularizer=weights_regularizer, + weights_initializer=weights_initializer): + with arg_scope([slim.fully_connected], activation_fn=None, scope='FC'): + with arg_scope( + [slim.conv2d, slim.separable_conv2d], + activation_fn=None, + biases_initializer=None): + with arg_scope([slim.batch_norm], **batch_norm_params) as sc: + return sc + + +def nasnet_mobile_arg_scope(weight_decay=4e-5, + batch_norm_decay=0.9997, + batch_norm_epsilon=1e-3): + """Defines the default arg scope for the NASNet-A Mobile ImageNet model. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: Decay for batch norm moving average. + batch_norm_epsilon: Small float added to variance to avoid dividing by zero + in batch norm. + Returns: + An `arg_scope` to use for the NASNet Mobile Model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': batch_norm_decay, + # epsilon to prevent 0s in variance. + 'epsilon': batch_norm_epsilon, + 'scale': True, + 'fused': True, + } + weights_regularizer = contrib_layers.l2_regularizer(weight_decay) + weights_initializer = contrib_layers.variance_scaling_initializer( + mode='FAN_OUT') + with arg_scope( + [slim.fully_connected, slim.conv2d, slim.separable_conv2d], + weights_regularizer=weights_regularizer, + weights_initializer=weights_initializer): + with arg_scope([slim.fully_connected], activation_fn=None, scope='FC'): + with arg_scope( + [slim.conv2d, slim.separable_conv2d], + activation_fn=None, + biases_initializer=None): + with arg_scope([slim.batch_norm], **batch_norm_params) as sc: + return sc + + +def nasnet_large_arg_scope(weight_decay=5e-5, + batch_norm_decay=0.9997, + batch_norm_epsilon=1e-3): + """Defines the default arg scope for the NASNet-A Large ImageNet model. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: Decay for batch norm moving average. + batch_norm_epsilon: Small float added to variance to avoid dividing by zero + in batch norm. + Returns: + An `arg_scope` to use for the NASNet Large Model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': batch_norm_decay, + # epsilon to prevent 0s in variance. 
+ 'epsilon': batch_norm_epsilon, + 'scale': True, + 'fused': True, + } + weights_regularizer = contrib_layers.l2_regularizer(weight_decay) + weights_initializer = contrib_layers.variance_scaling_initializer( + mode='FAN_OUT') + with arg_scope( + [slim.fully_connected, slim.conv2d, slim.separable_conv2d], + weights_regularizer=weights_regularizer, + weights_initializer=weights_initializer): + with arg_scope([slim.fully_connected], activation_fn=None, scope='FC'): + with arg_scope( + [slim.conv2d, slim.separable_conv2d], + activation_fn=None, + biases_initializer=None): + with arg_scope([slim.batch_norm], **batch_norm_params) as sc: + return sc + + +def _build_aux_head(net, end_points, num_classes, hparams, scope): + """Auxiliary head used for all models across all datasets.""" + with tf.variable_scope(scope): + aux_logits = tf.identity(net) + with tf.variable_scope('aux_logits'): + aux_logits = slim.avg_pool2d( + aux_logits, [5, 5], stride=3, padding='VALID') + aux_logits = slim.conv2d(aux_logits, 128, [1, 1], scope='proj') + aux_logits = slim.batch_norm(aux_logits, scope='aux_bn0') + aux_logits = tf.nn.relu(aux_logits) + # Shape of feature map before the final layer. + shape = aux_logits.shape + if hparams.data_format == 'NHWC': + shape = shape[1:3] + else: + shape = shape[2:4] + aux_logits = slim.conv2d(aux_logits, 768, shape, padding='VALID') + aux_logits = slim.batch_norm(aux_logits, scope='aux_bn1') + aux_logits = tf.nn.relu(aux_logits) + aux_logits = contrib_layers.flatten(aux_logits) + aux_logits = slim.fully_connected(aux_logits, num_classes) + end_points['AuxLogits'] = aux_logits + + +def _imagenet_stem(inputs, hparams, stem_cell): + """Stem used for models trained on ImageNet.""" + num_stem_cells = 2 + + # 149 x 149 x 32 + num_stem_filters = int(32 * hparams.stem_multiplier) + net = slim.conv2d( + inputs, + num_stem_filters, [3, 3], + stride=2, + scope='conv0', + padding='VALID') + net = slim.batch_norm(net, scope='conv0_bn') + + # Run the reduction cells + cell_outputs = [None, net] + filter_scaling = 1.0 / (hparams.filter_scaling_rate**num_stem_cells) + for cell_num in range(num_stem_cells): + net = stem_cell( + net, + scope='cell_stem_{}'.format(cell_num), + filter_scaling=filter_scaling, + stride=2, + prev_layer=cell_outputs[-2], + cell_num=cell_num) + cell_outputs.append(net) + filter_scaling *= hparams.filter_scaling_rate + return net, cell_outputs + + +def _cifar_stem(inputs, hparams): + """Stem used for models trained on Cifar.""" + num_stem_filters = int(hparams.num_conv_filters * hparams.stem_multiplier) + net = slim.conv2d(inputs, num_stem_filters, 3, scope='l1_stem_3x3') + net = slim.batch_norm(net, scope='l1_stem_bn') + return net, [None, net] + + +def build_nasnet_cifar(images, + num_classes=None, + is_training=True, + data_format=None, + total_steps=None): + """Build NASNet model for the Cifar Dataset.""" + hparams = _cifar_config( + is_training=is_training, data_format=data_format, total_steps=total_steps) + + if tf.test.is_gpu_available() and hparams.data_format == 'NHWC': + tf.logging.info('A GPU is available on the machine, consider using NCHW ' + 'data format for increased speed on GPU.') + + # Calculate the total number of cells in the network + # Add 2 for the reduction cells + total_num_cells = hparams.num_cells + 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, 
hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + with arg_scope( + [slim.dropout, nasnet_utils.drop_path, slim.batch_norm], + is_training=is_training): + with arg_scope( + [ + slim.avg_pool2d, slim.max_pool2d, slim.conv2d, slim.batch_norm, + slim.separable_conv2d, nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim + ], + data_format=hparams.data_format): + return _build_nasnet_base( + images, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + num_classes=num_classes, + hparams=hparams, + is_training=is_training, + stem_type='cifar') + + +build_nasnet_cifar.default_image_size = 32 + + +def build_nasnet_mobile(images, + num_classes=None, + is_training=True, + data_format=None, + total_steps=None, + final_endpoint=None): + """Build NASNet Mobile model for the ImageNet Dataset.""" + hparams = _mobile_imagenet_config( + data_format=data_format, total_steps=total_steps) + + if tf.test.is_gpu_available() and hparams.data_format == 'NHWC': + tf.logging.info('A GPU is available on the machine, consider using NCHW ' + 'data format for increased speed on GPU.') + + # Calculate the total number of cells in the network + # Add 2 for the reduction cells + total_num_cells = hparams.num_cells + 2 + # If ImageNet, then add an additional two for the stem cells + total_num_cells += 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + with arg_scope( + [slim.dropout, nasnet_utils.drop_path, slim.batch_norm], + is_training=is_training): + with arg_scope( + [ + slim.avg_pool2d, slim.max_pool2d, slim.conv2d, slim.batch_norm, + slim.separable_conv2d, nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim + ], + data_format=hparams.data_format): + return _build_nasnet_base( + images, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + num_classes=num_classes, + hparams=hparams, + is_training=is_training, + stem_type='imagenet', + final_endpoint=final_endpoint) + + +build_nasnet_mobile.default_image_size = 224 + + +def build_nasnet_large(images, + num_classes=None, + is_training=True, + data_format=None, + total_steps=None, + final_endpoint=None): + """Build NASNet Large model for the ImageNet Dataset.""" + hparams = _large_imagenet_config( + is_training=is_training, data_format=data_format, total_steps=total_steps) + + if tf.test.is_gpu_available() and hparams.data_format == 'NHWC': + tf.logging.info('A GPU is available on the machine, consider using NCHW ' + 'data format for increased speed on GPU.') + + # Calculate the total number of cells in the network + # Add 2 for the reduction cells + total_num_cells = hparams.num_cells + 2 + # If ImageNet, then add an additional two for the stem cells + total_num_cells += 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + with arg_scope( + [slim.dropout, nasnet_utils.drop_path, slim.batch_norm], + is_training=is_training): + with arg_scope( + [ 
+ slim.avg_pool2d, slim.max_pool2d, slim.conv2d, slim.batch_norm, + slim.separable_conv2d, nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim + ], + data_format=hparams.data_format): + return _build_nasnet_base( + images, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + num_classes=num_classes, + hparams=hparams, + is_training=is_training, + stem_type='imagenet', + final_endpoint=final_endpoint) + + +build_nasnet_large.default_image_size = 331 + + +def _build_nasnet_base(images, + normal_cell, + reduction_cell, + num_classes, + hparams, + is_training, + stem_type, + final_endpoint=None): + """Constructs a NASNet image model.""" + + end_points = {} + + def add_and_check_endpoint(endpoint_name, net): + end_points[endpoint_name] = net + return final_endpoint and (endpoint_name == final_endpoint) + + # Find where to place the reduction cells or stride normal cells + reduction_indices = nasnet_utils.calc_reduction_layers( + hparams.num_cells, hparams.num_reduction_layers) + stem_cell = reduction_cell + + if stem_type == 'imagenet': + stem = lambda: _imagenet_stem(images, hparams, stem_cell) + elif stem_type == 'cifar': + stem = lambda: _cifar_stem(images, hparams) + else: + raise ValueError('Unknown stem_type: ', stem_type) + net, cell_outputs = stem() + if add_and_check_endpoint('Stem', net): + return net, end_points + + # Setup for building in the auxiliary head. + aux_head_cell_idxes = [] + if len(reduction_indices) >= 2: + aux_head_cell_idxes.append(reduction_indices[1] - 1) + + # Run the cells + filter_scaling = 1.0 + # true_cell_num accounts for the stem cells + true_cell_num = 2 if stem_type == 'imagenet' else 0 + for cell_num in range(hparams.num_cells): + stride = 1 + if hparams.skip_reduction_layer_input: + prev_layer = cell_outputs[-2] + if cell_num in reduction_indices: + filter_scaling *= hparams.filter_scaling_rate + net = reduction_cell( + net, + scope='reduction_cell_{}'.format(reduction_indices.index(cell_num)), + filter_scaling=filter_scaling, + stride=2, + prev_layer=cell_outputs[-2], + cell_num=true_cell_num) + if add_and_check_endpoint( + 'Reduction_Cell_{}'.format(reduction_indices.index(cell_num)), net): + return net, end_points + true_cell_num += 1 + cell_outputs.append(net) + if not hparams.skip_reduction_layer_input: + prev_layer = cell_outputs[-2] + net = normal_cell( + net, + scope='cell_{}'.format(cell_num), + filter_scaling=filter_scaling, + stride=stride, + prev_layer=prev_layer, + cell_num=true_cell_num) + + if add_and_check_endpoint('Cell_{}'.format(cell_num), net): + return net, end_points + true_cell_num += 1 + if (hparams.use_aux_head and cell_num in aux_head_cell_idxes and + num_classes and is_training): + aux_net = tf.nn.relu(net) + _build_aux_head( + aux_net, + end_points, + num_classes, + hparams, + scope='aux_{}'.format(cell_num)) + cell_outputs.append(net) + + # Final softmax layer + with tf.variable_scope('final_layer'): + net = tf.nn.relu(net) + net = nasnet_utils.global_avg_pool(net) + if add_and_check_endpoint('global_pool', net) or num_classes is None: + return net, end_points + net = slim.dropout(net, hparams.dense_dropout_keep_prob, scope='dropout') + logits = slim.fully_connected(net, num_classes) + + if add_and_check_endpoint('Logits', logits): + return net, end_points + + predictions = tf.nn.softmax(logits, name='predictions') + if add_and_check_endpoint('Predictions', predictions): + return net, end_points + return logits, end_points + + +class 
NasnetModel(model.CNNModel): + """Nasnet model configuration.""" + + def __init__(self, params=None): + super(NasnetModel, self).__init__('nasnet', 224, 32, 0.005, params=params) + + def add_inference(self, cnn): + tf.logging.info('input_image_shape: {}'.format(cnn.top_layer.shape)) + cnn.top_layer, _ = build_nasnet_mobile( + images=cnn.top_layer, + is_training=cnn.phase_train, + data_format=cnn.data_format) + cnn.top_size = cnn.top_layer.shape[-1].value + + +class NasnetLargeModel(model.CNNModel): + """Nasnet model configuration.""" + + def __init__(self, params=None): + super(NasnetLargeModel, self).__init__( + 'nasnet', 331, 16, 0.005, params=params) + + def add_inference(self, cnn): + tf.logging.info('input_image_shape: {}'.format(cnn.top_layer.shape)) + cnn.top_layer, _ = build_nasnet_large( + images=cnn.top_layer, + is_training=cnn.phase_train, + data_format=cnn.data_format) + cnn.top_size = cnn.top_layer.shape[-1].value + + +class NasnetCifarModel(model.CNNModel): + """Nasnet cifar model configuration.""" + + def __init__(self, params=None): + super(NasnetCifarModel, self).__init__( + 'nasnet', 32, 32, 0.025, params=params) + + def add_inference(self, cnn): + tf.logging.info('input_image_shape: {}'.format(cnn.top_layer.shape)) + cnn.top_layer, _ = build_nasnet_cifar( + images=cnn.top_layer, + is_training=cnn.phase_train, + data_format=cnn.data_format) + cnn.top_size = cnn.top_layer.shape[-1].value diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_test.py b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..4e3bc3776e992c2688e6dd9dfeddbbf7835c6774 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_test.py @@ -0,0 +1,289 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for nasnet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +from models.tf1_only import nasnet_model as nasnet +from tensorflow.contrib import slim + + +class NASNetTest(tf.test.TestCase): + + def testBuildLogitsCifarModel(self): + batch_size = 5 + height, width = 32, 32 + num_classes = 10 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_cifar_arg_scope()): + logits, end_points = nasnet.build_nasnet_cifar(inputs, num_classes) + auxlogits = end_points['AuxLogits'] + predictions = end_points['Predictions'] + self.assertListEqual(auxlogits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(predictions.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildLogitsMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + logits, end_points = nasnet.build_nasnet_mobile(inputs, num_classes) + auxlogits = end_points['AuxLogits'] + predictions = end_points['Predictions'] + self.assertListEqual(auxlogits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(predictions.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildLogitsLargeModel(self): + batch_size = 5 + height, width = 331, 331 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + logits, end_points = nasnet.build_nasnet_large(inputs, num_classes) + auxlogits = end_points['AuxLogits'] + predictions = end_points['Predictions'] + self.assertListEqual(auxlogits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(predictions.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildPreLogitsCifarModel(self): + batch_size = 5 + height, width = 32, 32 + num_classes = None + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_cifar_arg_scope()): + net, end_points = nasnet.build_nasnet_cifar(inputs, num_classes) + self.assertNotIn('AuxLogits', end_points) + self.assertNotIn('Predictions', end_points) + self.assertTrue(net.op.name.startswith('final_layer/Mean')) + self.assertListEqual(net.get_shape().as_list(), [batch_size, 768]) + + def testBuildPreLogitsMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = None + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + net, end_points = nasnet.build_nasnet_mobile(inputs, num_classes) + self.assertNotIn('AuxLogits', end_points) + self.assertNotIn('Predictions', end_points) + self.assertTrue(net.op.name.startswith('final_layer/Mean')) + self.assertListEqual(net.get_shape().as_list(), [batch_size, 1056]) + + def testBuildPreLogitsLargeModel(self): + batch_size = 5 + height, width = 331, 331 + 
num_classes = None + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + net, end_points = nasnet.build_nasnet_large(inputs, num_classes) + self.assertNotIn('AuxLogits', end_points) + self.assertNotIn('Predictions', end_points) + self.assertTrue(net.op.name.startswith('final_layer/Mean')) + self.assertListEqual(net.get_shape().as_list(), [batch_size, 4032]) + + def testAllEndPointsShapesCifarModel(self): + batch_size = 5 + height, width = 32, 32 + num_classes = 10 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_cifar_arg_scope()): + _, end_points = nasnet.build_nasnet_cifar(inputs, num_classes) + endpoints_shapes = {'Stem': [batch_size, 32, 32, 96], + 'Cell_0': [batch_size, 32, 32, 192], + 'Cell_1': [batch_size, 32, 32, 192], + 'Cell_2': [batch_size, 32, 32, 192], + 'Cell_3': [batch_size, 32, 32, 192], + 'Cell_4': [batch_size, 32, 32, 192], + 'Cell_5': [batch_size, 32, 32, 192], + 'Cell_6': [batch_size, 16, 16, 384], + 'Cell_7': [batch_size, 16, 16, 384], + 'Cell_8': [batch_size, 16, 16, 384], + 'Cell_9': [batch_size, 16, 16, 384], + 'Cell_10': [batch_size, 16, 16, 384], + 'Cell_11': [batch_size, 16, 16, 384], + 'Cell_12': [batch_size, 8, 8, 768], + 'Cell_13': [batch_size, 8, 8, 768], + 'Cell_14': [batch_size, 8, 8, 768], + 'Cell_15': [batch_size, 8, 8, 768], + 'Cell_16': [batch_size, 8, 8, 768], + 'Cell_17': [batch_size, 8, 8, 768], + 'Reduction_Cell_0': [batch_size, 16, 16, 256], + 'Reduction_Cell_1': [batch_size, 8, 8, 512], + 'global_pool': [batch_size, 768], + # Logits and predictions + 'AuxLogits': [batch_size, num_classes], + 'Logits': [batch_size, num_classes], + 'Predictions': [batch_size, num_classes]} + self.assertCountEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + tf.logging.info('Endpoint name: {}'.format(endpoint_name)) + expected_shape = endpoints_shapes[endpoint_name] + self.assertIn(endpoint_name, end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testAllEndPointsShapesMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + _, end_points = nasnet.build_nasnet_mobile(inputs, num_classes) + endpoints_shapes = {'Stem': [batch_size, 28, 28, 88], + 'Cell_0': [batch_size, 28, 28, 264], + 'Cell_1': [batch_size, 28, 28, 264], + 'Cell_2': [batch_size, 28, 28, 264], + 'Cell_3': [batch_size, 28, 28, 264], + 'Cell_4': [batch_size, 14, 14, 528], + 'Cell_5': [batch_size, 14, 14, 528], + 'Cell_6': [batch_size, 14, 14, 528], + 'Cell_7': [batch_size, 14, 14, 528], + 'Cell_8': [batch_size, 7, 7, 1056], + 'Cell_9': [batch_size, 7, 7, 1056], + 'Cell_10': [batch_size, 7, 7, 1056], + 'Cell_11': [batch_size, 7, 7, 1056], + 'Reduction_Cell_0': [batch_size, 14, 14, 352], + 'Reduction_Cell_1': [batch_size, 7, 7, 704], + 'global_pool': [batch_size, 1056], + # Logits and predictions + 'AuxLogits': [batch_size, num_classes], + 'Logits': [batch_size, num_classes], + 'Predictions': [batch_size, num_classes]} + self.assertCountEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + tf.logging.info('Endpoint name: {}'.format(endpoint_name)) + expected_shape = endpoints_shapes[endpoint_name] + 
self.assertIn(endpoint_name, end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testAllEndPointsShapesLargeModel(self): + batch_size = 5 + height, width = 331, 331 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + _, end_points = nasnet.build_nasnet_large(inputs, num_classes) + endpoints_shapes = {'Stem': [batch_size, 42, 42, 336], + 'Cell_0': [batch_size, 42, 42, 1008], + 'Cell_1': [batch_size, 42, 42, 1008], + 'Cell_2': [batch_size, 42, 42, 1008], + 'Cell_3': [batch_size, 42, 42, 1008], + 'Cell_4': [batch_size, 42, 42, 1008], + 'Cell_5': [batch_size, 42, 42, 1008], + 'Cell_6': [batch_size, 21, 21, 2016], + 'Cell_7': [batch_size, 21, 21, 2016], + 'Cell_8': [batch_size, 21, 21, 2016], + 'Cell_9': [batch_size, 21, 21, 2016], + 'Cell_10': [batch_size, 21, 21, 2016], + 'Cell_11': [batch_size, 21, 21, 2016], + 'Cell_12': [batch_size, 11, 11, 4032], + 'Cell_13': [batch_size, 11, 11, 4032], + 'Cell_14': [batch_size, 11, 11, 4032], + 'Cell_15': [batch_size, 11, 11, 4032], + 'Cell_16': [batch_size, 11, 11, 4032], + 'Cell_17': [batch_size, 11, 11, 4032], + 'Reduction_Cell_0': [batch_size, 21, 21, 1344], + 'Reduction_Cell_1': [batch_size, 11, 11, 2688], + 'global_pool': [batch_size, 4032], + # Logits and predictions + 'AuxLogits': [batch_size, num_classes], + 'Logits': [batch_size, num_classes], + 'Predictions': [batch_size, num_classes]} + self.assertCountEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + tf.logging.info('Endpoint name: {}'.format(endpoint_name)) + expected_shape = endpoints_shapes[endpoint_name] + self.assertIn(endpoint_name, end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testVariablesSetDeviceMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + # Force all Variables to reside on the device. 
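+    # Build the same graph once under a CPU device scope and once under a
+    # GPU device scope, then verify that every global variable was placed on
+    # the requested device.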
+ with tf.variable_scope('on_cpu'), tf.device('/cpu:0'): + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + nasnet.build_nasnet_mobile(inputs, num_classes) + with tf.variable_scope('on_gpu'), tf.device('/gpu:0'): + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + nasnet.build_nasnet_mobile(inputs, num_classes) + for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_cpu'): + self.assertDeviceEqual(v.device, '/cpu:0') + for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_gpu'): + self.assertDeviceEqual(v.device, '/gpu:0') + + def testUnknownBatchSizeMobileModel(self): + batch_size = 1 + height, width = 224, 224 + num_classes = 1000 + with self.test_session() as sess: + inputs = tf.placeholder(tf.float32, (None, height, width, 3)) + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + logits, _ = nasnet.build_nasnet_mobile(inputs, num_classes) + self.assertListEqual(logits.get_shape().as_list(), + [None, num_classes]) + images = tf.random_uniform((batch_size, height, width, 3)) + sess.run(tf.global_variables_initializer()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch_size, num_classes)) + + def testEvaluationMobileModel(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + with self.test_session() as sess: + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + logits, _ = nasnet.build_nasnet_mobile(eval_inputs, + num_classes, + is_training=False) + predictions = tf.argmax(logits, 1) + sess.run(tf.global_variables_initializer()) + output = sess.run(predictions) + self.assertEqual(output.shape, (batch_size,)) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_utils.py b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b280b3ea85c35ca9f804ebecbf300d98bda6baa --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_utils.py @@ -0,0 +1,492 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A custom module for some common operations used by NASNet. 
+ +Functions exposed in this file: +- calc_reduction_layers +- get_channel_index +- get_channel_dim +- global_avg_pool +- factorized_reduction +- drop_path + +Classes exposed in this file: +- NasNetABaseCell +- NasNetANormalCell +- NasNetAReductionCell +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.contrib import framework as contrib_framework +from tensorflow.contrib import slim + +arg_scope = contrib_framework.arg_scope + +DATA_FORMAT_NCHW = 'NCHW' +DATA_FORMAT_NHWC = 'NHWC' +INVALID = 'null' + + +def calc_reduction_layers(num_cells, num_reduction_layers): + """Figure out what layers should have reductions.""" + reduction_layers = [] + for pool_num in range(1, num_reduction_layers + 1): + layer_num = (float(pool_num) / (num_reduction_layers + 1)) * num_cells + layer_num = int(layer_num) + reduction_layers.append(layer_num) + return reduction_layers + + +@contrib_framework.add_arg_scope +def get_channel_index(data_format=INVALID): + assert data_format != INVALID + axis = 3 if data_format == 'NHWC' else 1 + return axis + + +@contrib_framework.add_arg_scope +def get_channel_dim(shape, data_format=INVALID): + assert data_format != INVALID + assert len(shape) == 4 + if data_format == 'NHWC': + return int(shape[3]) + elif data_format == 'NCHW': + return int(shape[1]) + else: + raise ValueError('Not a valid data_format', data_format) + + +@contrib_framework.add_arg_scope +def global_avg_pool(x, data_format=INVALID): + """Average pool away the height and width spatial dimensions of x.""" + assert data_format != INVALID + assert data_format in ['NHWC', 'NCHW'] + assert x.shape.ndims == 4 + if data_format == 'NHWC': + return tf.reduce_mean(x, [1, 2]) + else: + return tf.reduce_mean(x, [2, 3]) + + +@contrib_framework.add_arg_scope +def factorized_reduction(net, output_filters, stride, data_format=INVALID): + """Reduces the shape of net without information loss due to striding.""" + assert output_filters % 2 == 0, ( + 'Need even number of filters when using this factorized reduction.') + assert data_format != INVALID + if stride == 1: + net = slim.conv2d(net, output_filters, 1, scope='path_conv') + net = slim.batch_norm(net, scope='path_bn') + return net + if data_format == 'NHWC': + stride_spec = [1, stride, stride, 1] + else: + stride_spec = [1, 1, stride, stride] + + # Skip path 1 + path1 = tf.nn.avg_pool( + net, [1, 1, 1, 1], stride_spec, 'VALID', data_format=data_format) + path1 = slim.conv2d(path1, int(output_filters / 2), 1, scope='path1_conv') + + # Skip path 2 + # First pad with 0's on the right and bottom, then shift the filter to + # include those 0's that were added. 
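+  # The second path therefore samples the input at a one-pixel diagonal
+  # offset relative to the first, so the concatenated result draws on two
+  # interleaved grids instead of discarding the odd-indexed positions.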
+  if data_format == 'NHWC':
+    pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
+    path2 = tf.pad(net, pad_arr)[:, 1:, 1:, :]
+    concat_axis = 3
+  else:
+    pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]]
+    path2 = tf.pad(net, pad_arr)[:, :, 1:, 1:]
+    concat_axis = 1
+
+  path2 = tf.nn.avg_pool(
+      path2, [1, 1, 1, 1], stride_spec, 'VALID', data_format=data_format)
+  path2 = slim.conv2d(path2, int(output_filters / 2), 1, scope='path2_conv')
+
+  # Concat and apply BN
+  final_path = tf.concat(values=[path1, path2], axis=concat_axis)
+  final_path = slim.batch_norm(final_path, scope='final_path_bn')
+  return final_path
+
+
+@contrib_framework.add_arg_scope
+def drop_path(net, keep_prob, is_training=True):
+  """Drops out a whole example hiddenstate with the specified probability."""
+  if is_training:
+    batch_size = tf.shape(net)[0]
+    noise_shape = [batch_size, 1, 1, 1]
+    keep_prob = tf.cast(keep_prob, dtype=net.dtype)
+    random_tensor = keep_prob
+    random_tensor += tf.random_uniform(noise_shape, dtype=net.dtype)
+    binary_tensor = tf.floor(random_tensor)
+    net = tf.div(net, keep_prob) * binary_tensor
+  return net
+
+
+def _operation_to_filter_shape(operation):
+  splitted_operation = operation.split('x')
+  filter_shape = int(splitted_operation[0][-1])
+  assert filter_shape == int(
+      splitted_operation[1][0]), 'Rectangular filters not supported.'
+  return filter_shape
+
+
+def _operation_to_num_layers(operation):
+  splitted_operation = operation.split('_')
+  if 'x' in splitted_operation[-1]:
+    return 1
+  return int(splitted_operation[-1])
+
+
+def _operation_to_info(operation):
+  """Takes in an operation name and returns meta information.
+
+  An example would be 'separable_3x3_4' -> (4, 3).
+
+  Args:
+    operation: String that corresponds to convolution operation.
+
+  Returns:
+    Tuple of (num layers, filter shape).
+ """ + num_layers = _operation_to_num_layers(operation) + filter_shape = _operation_to_filter_shape(operation) + return num_layers, filter_shape + + +def _stacked_separable_conv(net, stride, operation, filter_size): + """Takes in an operations and parses it to the correct sep operation.""" + num_layers, kernel_size = _operation_to_info(operation) + net_type = net.dtype + net = tf.cast(net, tf.float32) if net_type == tf.float16 else net + + for layer_num in range(num_layers - 1): + net = tf.nn.relu(net) + net = slim.separable_conv2d( + net, + filter_size, + kernel_size, + depth_multiplier=1, + scope='separable_{0}x{0}_{1}'.format(kernel_size, layer_num + 1), + stride=stride) + net = slim.batch_norm( + net, scope='bn_sep_{0}x{0}_{1}'.format(kernel_size, layer_num + 1)) + stride = 1 + net = tf.nn.relu(net) + net = slim.separable_conv2d( + net, + filter_size, + kernel_size, + depth_multiplier=1, + scope='separable_{0}x{0}_{1}'.format(kernel_size, num_layers), + stride=stride) + net = slim.batch_norm( + net, scope='bn_sep_{0}x{0}_{1}'.format(kernel_size, num_layers)) + net = tf.cast(net, net_type) + return net + + +def _operation_to_pooling_type(operation): + """Takes in the operation string and returns the pooling type.""" + splitted_operation = operation.split('_') + return splitted_operation[0] + + +def _operation_to_pooling_shape(operation): + """Takes in the operation string and returns the pooling kernel shape.""" + splitted_operation = operation.split('_') + shape = splitted_operation[-1] + assert 'x' in shape + filter_height, filter_width = shape.split('x') + assert filter_height == filter_width + return int(filter_height) + + +def _operation_to_pooling_info(operation): + """Parses the pooling operation string to return its type and shape.""" + pooling_type = _operation_to_pooling_type(operation) + pooling_shape = _operation_to_pooling_shape(operation) + return pooling_type, pooling_shape + + +def _pooling(net, stride, operation): + """Parses operation and performs the correct pooling operation on net.""" + padding = 'SAME' + pooling_type, pooling_shape = _operation_to_pooling_info(operation) + if pooling_type == 'avg': + net = slim.avg_pool2d(net, pooling_shape, stride=stride, padding=padding) + elif pooling_type == 'max': + net = slim.max_pool2d(net, pooling_shape, stride=stride, padding=padding) + else: + raise NotImplementedError('Unimplemented pooling type: ', pooling_type) + return net + + +class NasNetABaseCell(object): # pylint: disable=g-classes-have-attributes + """NASNet Cell class that is used as a 'layer' in image architectures. + + Args: + num_conv_filters: The number of filters for each convolution operation. + operations: List of operations that are performed in the NASNet Cell in + order. + used_hiddenstates: Binary array that signals if the hiddenstate was used + within the cell. This is used to determine what outputs of the cell + should be concatenated together. + hiddenstate_indices: Determines what hiddenstates should be combined + together with the specified operations to create the NASNet cell. 
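+    drop_path_keep_prob: Keep probability used by drop_path regularization
+      (1.0 disables it).
+    total_num_cells: Total number of cells in the network; used to scale the
+      per-layer drop_path rate.
+    total_training_steps: Total number of training steps; used to anneal the
+      drop_path rate over the course of training.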
+ """ + + def __init__(self, num_conv_filters, operations, used_hiddenstates, + hiddenstate_indices, drop_path_keep_prob, total_num_cells, + total_training_steps): + self._num_conv_filters = num_conv_filters + self._operations = operations + self._used_hiddenstates = used_hiddenstates + self._hiddenstate_indices = hiddenstate_indices + self._drop_path_keep_prob = drop_path_keep_prob + self._total_num_cells = total_num_cells + self._total_training_steps = total_training_steps + + def _reduce_prev_layer(self, prev_layer, curr_layer): + """Matches dimension of prev_layer to the curr_layer.""" + # Set the prev layer to the current layer if it is none + if prev_layer is None: + return curr_layer + curr_num_filters = self._filter_size + prev_num_filters = get_channel_dim(prev_layer.shape) + curr_filter_shape = int(curr_layer.shape[2]) + prev_filter_shape = int(prev_layer.shape[2]) + if curr_filter_shape != prev_filter_shape: + prev_layer = tf.nn.relu(prev_layer) + prev_layer = factorized_reduction(prev_layer, curr_num_filters, stride=2) + elif curr_num_filters != prev_num_filters: + prev_layer = tf.nn.relu(prev_layer) + prev_layer = slim.conv2d( + prev_layer, curr_num_filters, 1, scope='prev_1x1') + prev_layer = slim.batch_norm(prev_layer, scope='prev_bn') + return prev_layer + + def _cell_base(self, net, prev_layer): + """Runs the beginning of the conv cell before the predicted ops are run.""" + num_filters = self._filter_size + + # Check to be sure prev layer stuff is setup correctly + prev_layer = self._reduce_prev_layer(prev_layer, net) + + net = tf.nn.relu(net) + net = slim.conv2d(net, num_filters, 1, scope='1x1') + net = slim.batch_norm(net, scope='beginning_bn') + split_axis = get_channel_index() + net = tf.split(axis=split_axis, num_or_size_splits=1, value=net) + for split in net: + assert int(split.shape[split_axis] == int( + self._num_conv_filters * self._filter_scaling)) + net.append(prev_layer) + return net + + def __call__(self, + net, + scope=None, + filter_scaling=1, + stride=1, + prev_layer=None, + cell_num=-1): + """Runs the conv cell.""" + self._cell_num = cell_num + self._filter_scaling = filter_scaling + self._filter_size = int(self._num_conv_filters * filter_scaling) + + i = 0 + with tf.variable_scope(scope): + net = self._cell_base(net, prev_layer) + for iteration in range(5): + with tf.variable_scope('comb_iter_{}'.format(iteration)): + left_hiddenstate_idx, right_hiddenstate_idx = ( + self._hiddenstate_indices[i], self._hiddenstate_indices[i + 1]) + original_input_left = left_hiddenstate_idx < 2 + original_input_right = right_hiddenstate_idx < 2 + h1 = net[left_hiddenstate_idx] + h2 = net[right_hiddenstate_idx] + + operation_left = self._operations[i] + operation_right = self._operations[i + 1] + i += 2 + # Apply conv operations + with tf.variable_scope('left'): + h1 = self._apply_conv_operation(h1, operation_left, stride, + original_input_left) + with tf.variable_scope('right'): + h2 = self._apply_conv_operation(h2, operation_right, stride, + original_input_right) + + # Combine hidden states using 'add'. 
+ with tf.variable_scope('combine'): + h = h1 + h2 + + # Add hiddenstate to the list of hiddenstates we can choose from + net.append(h) + + with tf.variable_scope('cell_output'): + net = self._combine_unused_states(net) + + return net + + def _apply_conv_operation(self, net, operation, stride, + is_from_original_input): + """Applies the predicted conv operation to net.""" + # Dont stride if this is not one of the original hiddenstates + if stride > 1 and not is_from_original_input: + stride = 1 + input_filters = get_channel_dim(net.shape) + filter_size = self._filter_size + if 'separable' in operation: + net = _stacked_separable_conv(net, stride, operation, filter_size) + elif operation in ['none']: + # Check if a stride is needed, then use a strided 1x1 here + if stride > 1 or (input_filters != filter_size): + net = tf.nn.relu(net) + net = slim.conv2d(net, filter_size, 1, stride=stride, scope='1x1') + net = slim.batch_norm(net, scope='bn_1') + elif 'pool' in operation: + net = _pooling(net, stride, operation) + if input_filters != filter_size: + net = slim.conv2d(net, filter_size, 1, stride=1, scope='1x1') + net = slim.batch_norm(net, scope='bn_1') + else: + raise ValueError('Unimplemented operation', operation) + + if operation != 'none': + net = self._apply_drop_path(net) + return net + + def _combine_unused_states(self, net): + """Concatenate the unused hidden states of the cell.""" + used_hiddenstates = self._used_hiddenstates + + final_height = int(net[-1].shape[2]) + final_num_filters = get_channel_dim(net[-1].shape) + assert len(used_hiddenstates) == len(net) + for idx, used_h in enumerate(used_hiddenstates): + curr_height = int(net[idx].shape[2]) + curr_num_filters = get_channel_dim(net[idx].shape) + + # Determine if a reduction should be applied to make the number of + # filters match. + should_reduce = final_num_filters != curr_num_filters + should_reduce = (final_height != curr_height) or should_reduce + should_reduce = should_reduce and not used_h + if should_reduce: + stride = 2 if final_height != curr_height else 1 + with tf.variable_scope('reduction_{}'.format(idx)): + net[idx] = factorized_reduction(net[idx], final_num_filters, stride) + + states_to_combine = ([ + h for h, is_used in zip(net, used_hiddenstates) if not is_used + ]) + + # Return the concat of all the states + concat_axis = get_channel_index() + net = tf.concat(values=states_to_combine, axis=concat_axis) + return net + + @contrib_framework.add_arg_scope # No public API. For internal use only. + def _apply_drop_path(self, + net, + current_step=None, + use_summaries=True, + drop_connect_version='v3'): + """Apply drop_path regularization. + + Args: + net: the Tensor that gets drop_path regularization applied. + current_step: a float32 Tensor with the current global_step value, + to be divided by hparams.total_training_steps. Usually None, which + defaults to tf.train.get_or_create_global_step() properly casted. + use_summaries: a Python boolean. If set to False, no summaries are output. + drop_connect_version: one of 'v1', 'v2', 'v3', controlling whether + the dropout rate is scaled by current_step (v1), layer (v2), or + both (v3, the default). + + Returns: + The dropped-out value of `net`. 
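+
+    Worked example (illustrative numbers only): with drop_path_keep_prob=0.6,
+    cell_num=5, total_num_cells=10, and training half-way through, 'v3' first
+    scales the keep probability by layer, 1 - (6/10) * (1 - 0.6) = 0.76, and
+    then by step, 1 - 0.5 * (1 - 0.76) = 0.88.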
+ """ + drop_path_keep_prob = self._drop_path_keep_prob + if drop_path_keep_prob < 1.0: + assert drop_connect_version in ['v1', 'v2', 'v3'] + if drop_connect_version in ['v2', 'v3']: + # Scale keep prob by layer number + assert self._cell_num != -1 + # The added 2 is for the reduction cells + num_cells = self._total_num_cells + layer_ratio = (self._cell_num + 1) / float(num_cells) + if use_summaries: + with tf.device('/cpu:0'): + tf.summary.scalar('layer_ratio', layer_ratio) + drop_path_keep_prob = 1 - layer_ratio * (1 - drop_path_keep_prob) + if drop_connect_version in ['v1', 'v3']: + # Decrease the keep probability over time + if not current_step: + current_step = tf.cast(tf.train.get_or_create_global_step(), + tf.float32) + drop_path_burn_in_steps = self._total_training_steps + current_ratio = current_step / drop_path_burn_in_steps + current_ratio = tf.minimum(1.0, current_ratio) + if use_summaries: + with tf.device('/cpu:0'): + tf.summary.scalar('current_ratio', current_ratio) + drop_path_keep_prob = (1 - current_ratio * (1 - drop_path_keep_prob)) + if use_summaries: + with tf.device('/cpu:0'): + tf.summary.scalar('drop_path_keep_prob', drop_path_keep_prob) + net = drop_path(net, drop_path_keep_prob) + return net + + +class NasNetANormalCell(NasNetABaseCell): + """NASNetA Normal Cell.""" + + def __init__(self, num_conv_filters, drop_path_keep_prob, total_num_cells, + total_training_steps): + operations = [ + 'separable_5x5_2', 'separable_3x3_2', 'separable_5x5_2', + 'separable_3x3_2', 'avg_pool_3x3', 'none', 'avg_pool_3x3', + 'avg_pool_3x3', 'separable_3x3_2', 'none' + ] + used_hiddenstates = [1, 0, 0, 0, 0, 0, 0] + hiddenstate_indices = [0, 1, 1, 1, 0, 1, 1, 1, 0, 0] + super(NasNetANormalCell, self).__init__( + num_conv_filters, operations, used_hiddenstates, hiddenstate_indices, + drop_path_keep_prob, total_num_cells, total_training_steps) + + +class NasNetAReductionCell(NasNetABaseCell): + """NASNetA Reduction Cell.""" + + def __init__(self, num_conv_filters, drop_path_keep_prob, total_num_cells, + total_training_steps): + operations = [ + 'separable_5x5_2', 'separable_7x7_2', 'max_pool_3x3', 'separable_7x7_2', + 'avg_pool_3x3', 'separable_5x5_2', 'none', 'avg_pool_3x3', + 'separable_3x3_2', 'max_pool_3x3' + ] + used_hiddenstates = [1, 1, 1, 0, 0, 0, 0] + hiddenstate_indices = [0, 1, 0, 1, 0, 1, 3, 2, 2, 0] + super(NasNetAReductionCell, self).__init__( + num_conv_filters, operations, used_hiddenstates, hiddenstate_indices, + drop_path_keep_prob, total_num_cells, total_training_steps) diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/ssd_model.py b/cv/classification/resnet50/tensorflow/models/tf1_only/ssd_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3d959d5be5ccf2d0197196ef46e113665f06b258 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/ssd_model.py @@ -0,0 +1,683 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +"""SSD300 Model Configuration. + +References: + Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + Cheng-Yang Fu, Alexander C. Berg + SSD: Single Shot MultiBox Detector + arXiv:1512.02325 + +Ported from MLPerf reference implementation: + https://github.com/mlperf/reference/tree/ssd/single_stage_detector/ssd + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import multiprocessing +import os +import re +import threading +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import constants +import mlperf +import ssd_constants +from cnn_util import log_fn +from models import model as model_lib +from models import resnet_model +from tensorflow.contrib import layers as contrib_layers +from tensorflow.python.ops import variables + +BACKBONE_MODEL_SCOPE_NAME = 'resnet34_backbone' + + +class SSD300Model(model_lib.CNNModel): + """Single Shot Multibox Detection (SSD) model for 300x300 image datasets.""" + + def __init__(self, label_num=ssd_constants.NUM_CLASSES, batch_size=32, + learning_rate=1e-3, backbone='resnet34', params=None): + super(SSD300Model, self).__init__('ssd300', 300, batch_size, learning_rate, + params=params) + # For COCO dataset, 80 categories + 1 background = 81 labels + self.label_num = label_num + + # Currently only support ResNet-34 as backbone model + if backbone != 'resnet34': + raise ValueError('Invalid backbone model %s for SSD.' % backbone) + mlperf.logger.log(key=mlperf.tags.BACKBONE, value=backbone) + + # Number of channels and default boxes associated with the following layers: + # ResNet34 layer, Conv7, Conv8_2, Conv9_2, Conv10_2, Conv11_2 + self.out_chan = [256, 512, 512, 256, 256, 256] + mlperf.logger.log(key=mlperf.tags.LOC_CONF_OUT_CHANNELS, + value=self.out_chan) + + # Number of default boxes from layers of different scales + # 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 + self.num_dboxes = [4, 6, 6, 6, 4, 4] + mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS_PER_CELL, + value=self.num_dboxes) + + # TODO(haoyuzhang): in order to correctly restore in replicated mode, need + # to create a saver for each tower before graph is finalized. Use variable + # manager for better efficiency. + self.backbone_savers = [] + + # Collected predictions for eval stage. It maps each image id in eval + # dataset to a dict containing the following information: + # source_id: raw ID of image + # raw_shape: raw shape of image + # pred_box: encoded box coordinates of prediction + # pred_scores: scores of classes in prediction + self.predictions = {} + + # Global step when predictions are collected. + self.eval_global_step = 0 + + # Average precision. In asynchronous eval mode, this is the latest AP we + # get so far and may not be the results at current eval step. + self.eval_coco_ap = 0 + + # Process, queues, and thread for asynchronous evaluation. When enabled, + # create a separate process (async_eval_process) that continuously pull + # intermediate results from the predictions queue (a multiprocessing queue), + # process them, and push final results into results queue (another + # multiprocessing queue). The main thread is responsible to push message + # into predictions queue, and start a separate thread to continuously pull + # messages from results queue to update final results. 
+ # Message in predictions queue should be a tuple of two elements: + # (evaluation step, predictions) + # Message in results queue should be a tuple of two elements: + # (evaluation step, final results) + self.async_eval_process = None + self.async_eval_predictions_queue = None + self.async_eval_results_queue = None + self.async_eval_results_getter_thread = None + + # The MLPerf reference uses a starting lr of 1e-3 at bs=32. + self.base_lr_batch_size = 32 + + def skip_final_affine_layer(self): + return True + + def gpu_preprocess_nhwc(self, images, phase_train=True): + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation ; To evaluate using COCO' + 'metric, download and install Python COCO API from' + 'https://github.com/cocodataset/cocoapi') + + if phase_train: + images = ssd_dataloader.color_jitter( + images, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05) + images = ssd_dataloader.normalize_image(images) + return images + + def add_backbone_model(self, cnn): + # -------------------------------------------------------------------------- + # Resnet-34 backbone model -- modified for SSD + # -------------------------------------------------------------------------- + + # Input 300x300, output 150x150 + cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True) + cnn.mpool(3, 3, 2, 2, mode='SAME') + + resnet34_layers = [3, 4, 6, 3] + version = 'v1' + + # ResNet-34 block group 1 + # Input 150x150, output 75x75 + for i in range(resnet34_layers[0]): + # Last argument forces residual_block to use projection shortcut, even + # though the numbers of input and output channels are equal + resnet_model.residual_block(cnn, 64, 1, version) + + # ResNet-34 block group 2 + # Input 75x75, output 38x38 + for i in range(resnet34_layers[1]): + stride = 2 if i == 0 else 1 + resnet_model.residual_block(cnn, 128, stride, version, i == 0) + + # ResNet-34 block group 3 + # This block group is modified: first layer uses stride=1 so that the image + # size does not change in group of layers + # Input 38x38, output 38x38 + for i in range(resnet34_layers[2]): + # The following line is intentionally commented out to differentiate from + # the original ResNet-34 model + # stride = 2 if i == 0 else 1 + resnet_model.residual_block(cnn, 256, stride, version, i == 0) + + # ResNet-34 block group 4: removed final block group + # The following 3 lines are intentionally commented out to differentiate + # from the original ResNet-34 model + # for i in range(resnet34_layers[3]): + # stride = 2 if i == 0 else 1 + # resnet_model.residual_block(cnn, 512, stride, version, i == 0) + + def add_inference(self, cnn): + cnn.use_batch_norm = True + cnn.batch_norm_config = {'decay': ssd_constants.BATCH_NORM_DECAY, + 'epsilon': ssd_constants.BATCH_NORM_EPSILON, + 'scale': True} + + with tf.variable_scope(BACKBONE_MODEL_SCOPE_NAME): + self.add_backbone_model(cnn) + + # -------------------------------------------------------------------------- + # SSD additional layers + # -------------------------------------------------------------------------- + + def add_ssd_layer(cnn, depth, k_size, stride, mode): + return cnn.conv( + 
depth, + k_size, + k_size, + stride, + stride, + mode=mode, + use_batch_norm=False, + kernel_initializer=contrib_layers.xavier_initializer()) + + # Activations for feature maps of different layers + self.activations = [cnn.top_layer] + # Conv7_1, Conv7_2 + # Input 38x38, output 19x19 + add_ssd_layer(cnn, 256, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same')) + + # Conv8_1, Conv8_2 + # Input 19x19, output 10x10 + add_ssd_layer(cnn, 256, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same')) + + # Conv9_1, Conv9_2 + # Input 10x10, output 5x5 + add_ssd_layer(cnn, 128, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 256, 3, 2, 'same')) + + # Conv10_1, Conv10_2 + # Input 5x5, output 3x3 + add_ssd_layer(cnn, 128, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid')) + + # Conv11_1, Conv11_2 + # Input 3x3, output 1x1 + add_ssd_layer(cnn, 128, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid')) + + self.loc = [] + self.conf = [] + + for nd, ac, oc in zip(self.num_dboxes, self.activations, self.out_chan): + l = cnn.conv( + nd * 4, + 3, + 3, + 1, + 1, + input_layer=ac, + num_channels_in=oc, + activation=None, + use_batch_norm=False, + kernel_initializer=contrib_layers.xavier_initializer()) + scale = l.get_shape()[-1] + # shape = [batch_size, nd * 4, scale, scale] + l = tf.reshape(l, [self.batch_size, nd, 4, scale, scale]) + # shape = [batch_size, nd, 4, scale, scale] + l = tf.transpose(l, [0, 1, 3, 4, 2]) + # shape = [batch_size, nd, scale, scale, 4] + self.loc.append(tf.reshape(l, [self.batch_size, -1, 4])) + # shape = [batch_size, nd * scale * scale, 4] + + c = cnn.conv( + nd * self.label_num, + 3, + 3, + 1, + 1, + input_layer=ac, + num_channels_in=oc, + activation=None, + use_batch_norm=False, + kernel_initializer=contrib_layers.xavier_initializer()) + # shape = [batch_size, nd * label_num, scale, scale] + c = tf.reshape(c, [self.batch_size, nd, self.label_num, scale, scale]) + # shape = [batch_size, nd, label_num, scale, scale] + c = tf.transpose(c, [0, 1, 3, 4, 2]) + # shape = [batch_size, nd, scale, scale, label_num] + self.conf.append(tf.reshape(c, [self.batch_size, -1, self.label_num])) + # shape = [batch_size, nd * scale * scale, label_num] + + # Shape of locs: [batch_size, NUM_SSD_BOXES, 4] + # Shape of confs: [batch_size, NUM_SSD_BOXES, label_num] + locs, confs = tf.concat(self.loc, 1), tf.concat(self.conf, 1) + + # Pack location and confidence outputs into a single output layer + # Shape of logits: [batch_size, NUM_SSD_BOXES, 4+label_num] + logits = tf.concat([locs, confs], 2) + + cnn.top_layer = logits + cnn.top_size = 4 + self.label_num + + return cnn.top_layer + + def get_learning_rate(self, global_step, batch_size): + rescaled_lr = self.get_scaled_base_learning_rate(batch_size) + # Defined in MLPerf reference model + boundaries = [160000, 200000] + boundaries = [b * self.base_lr_batch_size // batch_size for b in boundaries] + decays = [1, 0.1, 0.01] + learning_rates = [rescaled_lr * d for d in decays] + lr = tf.train.piecewise_constant(global_step, boundaries, learning_rates) + warmup_steps = int(118287 / batch_size * 5) + warmup_lr = ( + rescaled_lr * tf.cast(global_step, tf.float32) / tf.cast( + warmup_steps, tf.float32)) + return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr) + + def get_scaled_base_learning_rate(self, batch_size): + """Calculates base learning rate for creating lr schedule. 
+ + In replicated mode, gradients are summed rather than averaged which, with + the sgd and momentum optimizers, increases the effective learning rate by + lr * num_gpus. Dividing the base lr by num_gpus negates the increase. + + Args: + batch_size: Total batch-size. + + Returns: + Base learning rate to use to create lr schedule. + """ + base_lr = self.learning_rate + if self.params.variable_update == 'replicated': + base_lr = self.learning_rate / self.params.num_gpus + scaled_lr = base_lr * (batch_size / self.base_lr_batch_size) + return scaled_lr + + def _collect_backbone_vars(self): + backbone_vars = tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, scope='.*'+ BACKBONE_MODEL_SCOPE_NAME) + var_list = {} + + # Assume variables in the checkpoint are following the naming convention of + # a model checkpoint trained with TF official model + # TODO(haoyuzhang): the following variable name parsing is hacky and easy + # to break if there is change in naming convention of either benchmarks or + # official models. + for v in backbone_vars: + # conv2d variable example (model <-- checkpoint): + # v/cg/conv24/conv2d/kernel:0 <-- conv2d_24/kernel + if 'conv2d' in v.name: + re_match = re.search(r'conv(\d+)/conv2d/(.+):', v.name) + if re_match: + layer_id = int(re_match.group(1)) + param_name = re_match.group(2) + vname_in_ckpt = self._var_name_in_official_model_ckpt( + 'conv2d', layer_id, param_name) + var_list[vname_in_ckpt] = v + + # batchnorm varariable example: + # v/cg/conv24/batchnorm25/gamma:0 <-- batch_normalization_25/gamma + elif 'batchnorm' in v.name: + re_match = re.search(r'batchnorm(\d+)/(.+):', v.name) + if re_match: + layer_id = int(re_match.group(1)) + param_name = re_match.group(2) + vname_in_ckpt = self._var_name_in_official_model_ckpt( + 'batch_normalization', layer_id, param_name) + var_list[vname_in_ckpt] = v + + return var_list + + def _var_name_in_official_model_ckpt(self, layer_name, layer_id, param_name): + """Return variable names according to convention in TF official models.""" + vname_in_ckpt = layer_name + if layer_id > 0: + vname_in_ckpt += '_' + str(layer_id) + vname_in_ckpt += '/' + param_name + return vname_in_ckpt + + def loss_function(self, inputs, build_network_result): + logits = build_network_result.logits + + # Unpack model output back to locations and confidence scores of predictions + # Shape of pred_loc: [batch_size, NUM_SSD_BOXES, 4] + # Shape of pred_label: [batch_size, NUM_SSD_BOXES, label_num] + pred_loc, pred_label = tf.split(logits, [4, self.label_num], 2) + + # Shape of gt_loc: [batch_size, NUM_SSD_BOXES, 4] + # Shape of gt_label: [batch_size, NUM_SSD_BOXES, 1] + # Shape of num_gt: [batch_size] + _, gt_loc, gt_label, num_gt = inputs + gt_label = tf.cast(gt_label, tf.int32) + + box_loss = self._localization_loss(pred_loc, gt_loc, gt_label, num_gt) + class_loss = self._classification_loss(pred_label, gt_label, num_gt) + + tf.summary.scalar('box_loss', tf.reduce_mean(box_loss)) + tf.summary.scalar('class_loss', tf.reduce_mean(class_loss)) + return class_loss + box_loss + + def _localization_loss(self, pred_loc, gt_loc, gt_label, num_matched_boxes): + """Computes the localization loss. + + Computes the localization loss using smooth l1 loss. + Args: + pred_loc: a flatten tensor that includes all predicted locations. The + shape is [batch_size, num_anchors, 4]. + gt_loc: a tensor representing box regression targets in + [batch_size, num_anchors, 4]. + gt_label: a tensor that represents the classification groundtruth targets. 
+ The shape is [batch_size, num_anchors, 1]. + num_matched_boxes: the number of anchors that are matched to a groundtruth + targets, used as the loss normalizater. The shape is [batch_size]. + Returns: + box_loss: a float32 representing total box regression loss. + """ + mask = tf.greater(tf.squeeze(gt_label), 0) + float_mask = tf.cast(mask, tf.float32) + + smooth_l1 = tf.reduce_sum(tf.losses.huber_loss( + gt_loc, pred_loc, + reduction=tf.losses.Reduction.NONE + ), axis=2) + smooth_l1 = tf.multiply(smooth_l1, float_mask) + box_loss = tf.reduce_sum(smooth_l1, axis=1) + + return tf.reduce_mean(box_loss / num_matched_boxes) + + def _classification_loss(self, pred_label, gt_label, num_matched_boxes): + """Computes the classification loss. + + Computes the classification loss with hard negative mining. + Args: + pred_label: a flatten tensor that includes all predicted class. The shape + is [batch_size, num_anchors, num_classes]. + gt_label: a tensor that represents the classification groundtruth targets. + The shape is [batch_size, num_anchors, 1]. + num_matched_boxes: the number of anchors that are matched to a groundtruth + targets. This is used as the loss normalizater. + + Returns: + box_loss: a float32 representing total box regression loss. + """ + cross_entropy = tf.losses.sparse_softmax_cross_entropy( + gt_label, pred_label, reduction=tf.losses.Reduction.NONE) + + mask = tf.greater(tf.squeeze(gt_label), 0) + float_mask = tf.cast(mask, tf.float32) + + # Hard example mining + neg_masked_cross_entropy = cross_entropy * (1 - float_mask) + relative_position = tf.argsort( + tf.argsort( + neg_masked_cross_entropy, direction='DESCENDING')) + num_neg_boxes = tf.minimum( + tf.to_int32(num_matched_boxes) * ssd_constants.NEGS_PER_POSITIVE, + ssd_constants.NUM_SSD_BOXES) + top_k_neg_mask = tf.cast(tf.less( + relative_position, + tf.tile(num_neg_boxes[:, tf.newaxis], (1, ssd_constants.NUM_SSD_BOXES)) + ), tf.float32) + + class_loss = tf.reduce_sum( + tf.multiply(cross_entropy, float_mask + top_k_neg_mask), axis=1) + + return tf.reduce_mean(class_loss / num_matched_boxes) + + def add_backbone_saver(self): + # Create saver with mapping from variable names in checkpoint of backbone + # model to variables in SSD model + backbone_var_list = self._collect_backbone_vars() + self.backbone_savers.append(tf.train.Saver(backbone_var_list)) + + def load_backbone_model(self, sess, backbone_model_path): + for saver in self.backbone_savers: + saver.restore(sess, backbone_model_path) + + def get_input_data_types(self, subset): + if subset == 'validation': + return [self.data_type, tf.float32, tf.float32, tf.float32, tf.int32] + return [self.data_type, tf.float32, tf.float32, tf.float32] + + def get_input_shapes(self, subset): + """Return encoded tensor shapes for train and eval data respectively.""" + if subset == 'validation': + # Validation data shapes: + # 1. images + # 2. ground truth locations of boxes + # 3. ground truth classes of objects in boxes + # 4. source image IDs + # 5. raw image shapes + return [ + [self.batch_size, self.image_size, self.image_size, self.depth], + [self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 4], + [self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 1], + [self.batch_size], + [self.batch_size, 3], + ] + + # Training data shapes: + # 1. images + # 2. ground truth locations of boxes + # 3. ground truth classes of objects in boxes + # 4. 
numbers of objects in images + return [ + [self.batch_size, self.image_size, self.image_size, self.depth], + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 4], + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 1], + [self.batch_size] + ] + + def accuracy_function(self, inputs, logits): + """Returns the ops to measure the mean precision of the model.""" + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + from object_detection.box_coders import faster_rcnn_box_coder # pylint: disable=g-import-not-at-top + from object_detection.core import box_coder # pylint: disable=g-import-not-at-top + from object_detection.core import box_list # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation ; To evaluate using COCO' + 'metric, download and install Python COCO API from' + 'https://github.com/cocodataset/cocoapi') + + # Unpack model output back to locations and confidence scores of predictions + # pred_locs: relative locations (coordinates) of objects in all SSD boxes + # shape: [batch_size, NUM_SSD_BOXES, 4] + # pred_labels: confidence scores of objects being of all categories + # shape: [batch_size, NUM_SSD_BOXES, label_num] + pred_locs, pred_labels = tf.split(logits, [4, self.label_num], 2) + + ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=ssd_constants.BOX_CODER_SCALES) + anchors = box_list.BoxList( + tf.convert_to_tensor(ssd_dataloader.DefaultBoxes()('ltrb'))) + pred_boxes = box_coder.batch_decode( + encoded_boxes=pred_locs, box_coder=ssd_box_coder, anchors=anchors) + + pred_scores = tf.nn.softmax(pred_labels, axis=2) + + # TODO(haoyuzhang): maybe use `gt_boxes` and `gt_classes` for visualization. + _, gt_boxes, gt_classes, source_id, raw_shape = inputs # pylint: disable=unused-variable + + return { + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.PRED_BOXES): pred_boxes, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.PRED_SCORES): pred_scores, + # TODO(haoyuzhang): maybe use these values for visualization. + # constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_boxes': gt_boxes, + # constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_classes': gt_classes, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.SOURCE_ID): source_id, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.RAW_SHAPE): raw_shape + } + + def postprocess(self, results): + """Postprocess results returned from model.""" + try: + import coco_metric # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation ; To evaluate using COCO' + 'metric, download and install Python COCO API from' + 'https://github.com/cocodataset/cocoapi') + + pred_boxes = results[ssd_constants.PRED_BOXES] + pred_scores = results[ssd_constants.PRED_SCORES] + # TODO(haoyuzhang): maybe use these values for visualization. 
+ # gt_boxes = results['gt_boxes'] + # gt_classes = results['gt_classes'] + source_id = results[ssd_constants.SOURCE_ID] + raw_shape = results[ssd_constants.RAW_SHAPE] + + # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due + # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting + # `num_eval_epochs` to 1 is not enough and will often miss some images. We + # expect user to set `num_eval_epochs` to >1, which will leave some unused + # images from previous steps in `predictions`. Here we check if we are doing + # eval at a new global step. + if results['global_step'] > self.eval_global_step: + self.eval_global_step = results['global_step'] + self.predictions.clear() + + for i, sid in enumerate(source_id): + self.predictions[int(sid)] = { + ssd_constants.PRED_BOXES: pred_boxes[i], + ssd_constants.PRED_SCORES: pred_scores[i], + ssd_constants.SOURCE_ID: source_id[i], + ssd_constants.RAW_SHAPE: raw_shape[i] + } + + # COCO metric calculates mAP only after a full epoch of evaluation. Return + # dummy results for top_N_accuracy to be compatible with benchmar_cnn.py. + if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES: + log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format( + ssd_constants.COCO_NUM_VAL_IMAGES)) + + annotation_file = os.path.join(self.params.data_dir, + ssd_constants.ANNOTATION_FILE) + # Size of predictions before decoding about 15--30GB, while size after + # decoding is 100--200MB. When using async eval mode, decoding takes + # 20--30 seconds of main thread time but is necessary to avoid OOM during + # inter-process communication. + decoded_preds = coco_metric.decode_predictions(self.predictions.values()) + self.predictions.clear() + + if self.params.collect_eval_results_async: + def _eval_results_getter(): + """Iteratively get eval results from async eval process.""" + while True: + step, eval_results = self.async_eval_results_queue.get() + self.eval_coco_ap = eval_results['COCO/AP'] + mlperf.logger.log_eval_accuracy( + self.eval_coco_ap, step, self.batch_size * self.params.num_gpus, + ssd_constants.COCO_NUM_TRAIN_IMAGES) + if self.reached_target(): + # Reached target, clear all pending messages in predictions queue + # and insert poison pill to stop the async eval process. + while not self.async_eval_predictions_queue.empty(): + self.async_eval_predictions_queue.get() + self.async_eval_predictions_queue.put('STOP') + break + + if not self.async_eval_process: + # Limiting the number of messages in predictions queue to prevent OOM. + # Each message (predictions data) can potentially consume a lot of + # memory, and normally there should only be few messages in the queue. + # If often blocked on this, consider reducing eval frequency. + self.async_eval_predictions_queue = multiprocessing.Queue(2) + self.async_eval_results_queue = multiprocessing.Queue() + + # Reason to use a Process as opposed to Thread is mainly the + # computationally intensive eval runner. Python multithreading is not + # truly running in parallel, a runner thread would get significantly + # delayed (or alternatively delay the main thread). 
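+          # The process and the results-getter thread are marked as daemons
+          # below, so they do not block interpreter shutdown if training
+          # exits first.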
+ self.async_eval_process = multiprocessing.Process( + target=coco_metric.async_eval_runner, + args=(self.async_eval_predictions_queue, + self.async_eval_results_queue, + annotation_file)) + self.async_eval_process.daemon = True + self.async_eval_process.start() + + self.async_eval_results_getter_thread = threading.Thread( + target=_eval_results_getter, args=()) + self.async_eval_results_getter_thread.daemon = True + self.async_eval_results_getter_thread.start() + + self.async_eval_predictions_queue.put( + (self.eval_global_step, decoded_preds)) + return {'top_1_accuracy': 0, 'top_5_accuracy': 0.} + + eval_results = coco_metric.compute_map(decoded_preds, annotation_file) + self.eval_coco_ap = eval_results['COCO/AP'] + ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.} + for metric_key, metric_value in eval_results.items(): + ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value + mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step, + self.batch_size * self.params.num_gpus, + ssd_constants.COCO_NUM_TRAIN_IMAGES) + return ret + log_fn('Got {:d} out of {:d} eval examples.' + ' Waiting for the remaining to calculate mAP...'.format( + len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES)) + return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.} + + def get_synthetic_inputs(self, input_name, nclass): + """Generating synthetic data matching real data shape and type.""" + inputs = tf.random_uniform( + self.get_input_shapes('train')[0], dtype=self.data_type) + inputs = variables.VariableV1(inputs, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name=input_name) + boxes = tf.random_uniform( + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 4], dtype=tf.float32) + classes = tf.random_uniform( + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 1], dtype=tf.float32) + nboxes = tf.random_uniform( + [self.batch_size], minval=1, maxval=10, dtype=tf.float32) + return (inputs, boxes, classes, nboxes) + + def reached_target(self): + return (self.params.stop_at_top_1_accuracy and + self.eval_coco_ap >= self.params.stop_at_top_1_accuracy) diff --git a/cv/classification/resnet50/tensorflow/models/trivial_model.py b/cv/classification/resnet50/tensorflow/models/trivial_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba84d72672c6e3c0903c9af2d0dddecdd7fa2c1 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/trivial_model.py @@ -0,0 +1,73 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Trivial model configuration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from models import model + + +class TrivialModel(model.CNNModel): + """Trivial model configuration.""" + + def __init__(self, params=None): + super(TrivialModel, self).__init__( + 'trivial', 224 + 3, 32, 0.005, params=params) + + def add_inference(self, cnn): + cnn.reshape([-1, 227 * 227 * 3]) + cnn.affine(1) + cnn.affine(4096) + + +class TrivialCifar10Model(model.CNNModel): + """Trivial cifar10 model configuration.""" + + def __init__(self, params=None): + super(TrivialCifar10Model, self).__init__( + 'trivial', 32, 32, 0.005, params=params) + + def add_inference(self, cnn): + cnn.reshape([-1, 32 * 32 * 3]) + cnn.affine(1) + cnn.affine(4096) + + +class TrivialSSD300Model(model.CNNModel): + """Trivial SSD300 model configuration.""" + + def __init__(self, params=None): + super(TrivialSSD300Model, self).__init__( + 'trivial', 300, params.batch_size, 0.005, params=params) + + def add_inference(self, cnn): + cnn.reshape([-1, 300 * 300 * 3]) + cnn.affine(1) + cnn.affine(4096) + + def get_input_shapes(self, subset): + return [[self.batch_size, 300, 300, 3], + [self.batch_size, 8732, 4], + [self.batch_size, 8732, 1], + [self.batch_size]] + + def loss_function(self, inputs, build_network_result): + images, _, _, labels = inputs + labels = tf.cast(labels, tf.int32) + return super(TrivialSSD300Model, self).loss_function( + (images, labels), build_network_result) diff --git a/cv/classification/resnet50/tensorflow/models/vgg_model.py b/cv/classification/resnet50/tensorflow/models/vgg_model.py new file mode 100644 index 0000000000000000000000000000000000000000..938385c95bbc916ca8677bca232085334a48bbf4 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/vgg_model.py @@ -0,0 +1,83 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Vgg model configuration. + +Includes multiple models: vgg11, vgg16, vgg19, corresponding to + model A, D, and E in Table 1 of [1]. 
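+
+The variants differ only in the number of 3x3 conv layers per block;
+`_construct_vgg` below receives these counts as a five-element list
+(e.g. [2, 2, 3, 3, 3] for vgg16).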
+ +References: +[1] Simonyan, Karen, Andrew Zisserman + Very Deep Convolutional Networks for Large-Scale Image Recognition + arXiv:1409.1556 (2014) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import xrange # pylint: disable=redefined-builtin +from models import model + + +def _construct_vgg(cnn, num_conv_layers): + """Build vgg architecture from blocks.""" + assert len(num_conv_layers) == 5 + for _ in xrange(num_conv_layers[0]): + cnn.conv(64, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[1]): + cnn.conv(128, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[2]): + cnn.conv(256, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[3]): + cnn.conv(512, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[4]): + cnn.conv(512, 3, 3) + cnn.mpool(2, 2) + cnn.reshape([-1, 512 * 7 * 7]) + cnn.affine(4096) + cnn.dropout() + cnn.affine(4096) + cnn.dropout() + + +class Vgg11Model(model.CNNModel): + + def __init__(self, params=None): + super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005, params=params) + + def add_inference(self, cnn): + _construct_vgg(cnn, [1, 1, 2, 2, 2]) + + +class Vgg16Model(model.CNNModel): + + def __init__(self, params=None): + super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005, params=params) + + def add_inference(self, cnn): + _construct_vgg(cnn, [2, 2, 3, 3, 3]) + + +class Vgg19Model(model.CNNModel): + + def __init__(self, params=None): + super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005, params=params) + + def add_inference(self, cnn): + _construct_vgg(cnn, [2, 2, 4, 4, 4]) diff --git a/cv/classification/resnet50/tensorflow/platforms/__init__.py b/cv/classification/resnet50/tensorflow/platforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/platforms/default/__init__.py b/cv/classification/resnet50/tensorflow/platforms/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/platforms/default/util.py b/cv/classification/resnet50/tensorflow/platforms/default/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e64b9137fa6ccc5d12b07126dcf30265574eae41 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/platforms/default/util.py @@ -0,0 +1,90 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Utility code for the default platform.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import tempfile + +import cnn_util +from models import model_config + + +_ROOT_PROJECT_DIR = os.path.dirname(cnn_util.__file__) + + +def define_platform_params(): + """Defines platform-specific parameters. + + Currently there are no platform-specific parameters to be defined. + """ + pass + + +def get_cluster_manager(params, config_proto): + """Returns the cluster manager to be used.""" + return cnn_util.GrpcClusterManager(params, config_proto) + + +def get_command_to_run_python_module(module): + """Returns a command to run a Python module.""" + python_interpretter = sys.executable + if not python_interpretter: + raise ValueError('Could not find Python interpreter') + return [python_interpretter, + os.path.join(_ROOT_PROJECT_DIR, module + '.py')] + + +def get_test_output_dir(): + """Returns a directory where test outputs should be placed.""" + base_dir = os.environ.get('TEST_OUTPUTS_DIR', + '/tmp/tf_cnn_benchmarks_test_outputs') + if not os.path.exists(base_dir): + os.mkdir(base_dir) + return tempfile.mkdtemp(dir=base_dir) + + +def get_test_data_dir(): + """Returns the path to the test_data directory.""" + return os.path.join(_ROOT_PROJECT_DIR, 'test_data') + + +def get_ssd_backborn_model_file(): + raise NotImplementedError + + +def get_ssd_backboard_data_dir(): + raise NotImplementedError + + +def _initialize(params, config_proto): + del params, config_proto + model_config.register_tf1_models() + + +_is_initalized = False + + +def initialize(params, config_proto): + global _is_initalized + if _is_initalized: + return + _is_initalized = True + _initialize(params, config_proto) diff --git a/cv/classification/resnet50/tensorflow/platforms/util.py b/cv/classification/resnet50/tensorflow/platforms/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9d569691bdec804080d62d11f8a200cd1ec2f2a9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/platforms/util.py @@ -0,0 +1,30 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Utility code for a certain platform. + +This file simply imports everything from the default platform. To switch to a +different platform, the import statement can be changed to point to a new +platform. + +Creating a custom platform can be useful to, e.g., run some initialization code +required by the platform or register a platform-specific model. 
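+
+For example, for a hypothetical platform package `platforms.my_platform`, the
+wildcard import at the bottom of this file would become:
+
+    from platforms.my_platform.util import *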
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from platforms.default.util import * # pylint: disable=unused-import,wildcard-import diff --git a/cv/classification/resnet50/tensorflow/preprocessing.py b/cv/classification/resnet50/tensorflow/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..43cca8c2adc810150c726f07994c0042f3f4b7f4 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/preprocessing.py @@ -0,0 +1,1336 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Image pre-processing utilities. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import cnn_util +try: + from tensorflow.python.data.experimental.ops import threadpool +except: + threadpool = None +from tensorflow.python.data.ops import multi_device_iterator_ops +from tensorflow.python.framework import function +from tensorflow.python.layers import utils +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.platform import gfile +import mlperf +import numpy as np + +tf.random.set_random_seed(42) +np.random.seed(42) + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields: + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + text: Tensor tf.string containing the human-readable label. + """ + # Dense features in Example proto. 
+ feature_map = { + 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.parse_single_example(example_serialized, feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. + bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(bbox, [0, 2, 1]) + + return features['image/encoded'], label, bbox, features['image/class/text'] + + +_RESIZE_METHOD_MAP = { + 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, + 'bilinear': tf.image.ResizeMethod.BILINEAR, + 'bicubic': tf.image.ResizeMethod.BICUBIC, + 'area': tf.image.ResizeMethod.AREA +} + + +def get_image_resize_method(resize_method, batch_position=0): + """Get tensorflow resize method. + + If resize_method is 'round_robin', return different methods based on batch + position in a round-robin fashion. NOTE: If the batch size is not a multiple + of the number of methods, then the distribution of methods will not be + uniform. + + Args: + resize_method: (string) nearest, bilinear, bicubic, area, or round_robin. + batch_position: position of the image in a batch. NOTE: this argument can + be an integer or a tensor + Returns: + one of resize type defined in tf.image.ResizeMethod. + """ + + if resize_method != 'round_robin': + return _RESIZE_METHOD_MAP[resize_method] + + # return a resize method based on batch position in a round-robin fashion. + resize_methods = list(_RESIZE_METHOD_MAP.values()) + def lookup(index): + return resize_methods[index] + + def resize_method_0(): + return utils.smart_cond(batch_position % len(resize_methods) == 0, + lambda: lookup(0), resize_method_1) + + def resize_method_1(): + return utils.smart_cond(batch_position % len(resize_methods) == 1, + lambda: lookup(1), resize_method_2) + + def resize_method_2(): + return utils.smart_cond(batch_position % len(resize_methods) == 2, + lambda: lookup(2), lambda: lookup(3)) + + # NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here + # because TF would not be able to construct a finite graph. + + return resize_method_0() + + +def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32): + """Decode a JPEG string into one 3-D float image Tensor. + + Args: + image_buffer: scalar string Tensor. + scope: Optional scope for op_scope. + Returns: + 3-D float Tensor with values ranging from [0, 1). + """ + # with tf.op_scope([image_buffer], scope, 'decode_jpeg'): + # with tf.name_scope(scope, 'decode_jpeg', [image_buffer]): + with tf.name_scope(scope or 'decode_jpeg'): + # Decode the string as an RGB JPEG. 
+ # Note that the resulting image contains an unknown height and width + # that is set dynamically by decode_jpeg. In other words, the height + # and width of image is unknown at compile-time. + image = tf.image.decode_jpeg(image_buffer, channels=3, + fancy_upscaling=False, + dct_method='INTEGER_FAST') + + # image = tf.Print(image, [tf.shape(image)], 'Image shape: ') + + return image + + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 +_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] + + +def normalized_image(images): + # Rescale from [0, 255] to [0, 2] + images = tf.multiply(images, 1. / 127.5) + # Rescale to [-1, 1] + mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, value=[1.0] * 3) + return tf.subtract(images, 1.0) + + +def eval_image(image, + height, + width, + batch_position, + resize_method, + summary_verbosity=0): + """Get the image for model evaluation. + + We preprocess the image simiarly to Slim, see + https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py + Validation images do not have bounding boxes, so to crop the image, we first + resize the image such that the aspect ratio is maintained and the resized + height and width are both at least 1.145 times `height` and `width` + respectively. Then, we do a central crop to size (`height`, `width`). + + Args: + image: 3-D float Tensor representing the image. + height: The height of the image that will be returned. + width: The width of the image that will be returned. + batch_position: position of the image in a batch, which affects how images + are distorted and resized. NOTE: this argument can be an integer or a + tensor + resize_method: one of the strings 'round_robin', 'nearest', 'bilinear', + 'bicubic', or 'area'. + summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both + summaries and checkpoints. + Returns: + An image of size (output_height, output_width, 3) that is resized and + cropped as described above. + """ + # TODO(reedwm): Currently we resize then crop. Investigate if it's faster to + # crop then resize. + with tf.name_scope('eval_image'): + if summary_verbosity >= 3: + tf.summary.image( + 'original_image', tf.expand_dims(image, 0)) + + shape = tf.shape(image) + image_height = shape[0] + image_width = shape[1] + image_height_float = tf.cast(image_height, tf.float32) + image_width_float = tf.cast(image_width, tf.float32) + + # This value is chosen so that in resnet, images are cropped to a size of + # 256 x 256, which matches what other implementations do. The final image + # size for resnet is 224 x 224, and floor(224 * 1.145) = 256. + scale_factor = 1.145 + + # Compute resize_height and resize_width to be the minimum values such that + # 1. The aspect ratio is maintained (i.e. resize_height / resize_width is + # image_height / image_width), and + # 2. resize_height >= height * `scale_factor`, and + # 3. 
resize_width >= width * `scale_factor` + max_ratio = tf.maximum(height / image_height_float, + width / image_width_float) + resize_height = tf.cast(image_height_float * max_ratio * scale_factor, + tf.int32) + resize_width = tf.cast(image_width_float * max_ratio * scale_factor, + tf.int32) + mlperf.logger.log_input_resize_aspect_preserving(height, width, + scale_factor) + + # Resize the image to shape (`resize_height`, `resize_width`) + image_resize_method = get_image_resize_method(resize_method, batch_position) + distorted_image = tf.image.resize_images(image, + [resize_height, resize_width], + image_resize_method, + align_corners=False) + + # Do a central crop of the image to size (height, width). + # MLPerf requires us to log (height, width) with two different keys. + mlperf.logger.log(key=mlperf.tags.INPUT_CENTRAL_CROP, value=[height, width]) + mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width]) + total_crop_height = (resize_height - height) + crop_top = total_crop_height // 2 + total_crop_width = (resize_width - width) + crop_left = total_crop_width // 2 + distorted_image = tf.slice(distorted_image, [crop_top, crop_left, 0], + [height, width, 3]) + + distorted_image.set_shape([height, width, 3]) + if summary_verbosity >= 3: + tf.summary.image( + 'cropped_resized_image', tf.expand_dims(distorted_image, 0)) + image = distorted_image + return image + + +def train_image(image_buffer, + height, + width, + bbox, + batch_position, + resize_method, + distortions, + scope=None, + summary_verbosity=0, + distort_color_in_yiq=False, + fuse_decode_and_crop=False): + """Distort one image for training a network. + + Distorting images provides a useful technique for augmenting the data + set during training in order to make the network invariant to aspects + of the image that do not effect the label. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + height: integer + width: integer + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged + as [ymin, xmin, ymax, xmax]. + batch_position: position of the image in a batch, which affects how images + are distorted and resized. NOTE: this argument can be an integer or a + tensor + resize_method: round_robin, nearest, bilinear, bicubic, or area. + distortions: If true, apply full distortions for image colors. + scope: Optional scope for op_scope. + summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both + summaries and checkpoints. + distort_color_in_yiq: distort color of input images in YIQ space. + fuse_decode_and_crop: fuse the decode/crop operation. + Returns: + 3-D float Tensor of distorted image used for training. + """ + # with tf.op_scope([image, height, width, bbox], scope, 'distort_image'): + # with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]): + with tf.name_scope(scope or 'distort_image'): + # A large fraction of image datasets contain a human-annotated bounding box + # delineating the region of the image containing the object of interest. We + # choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an + # allowed range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. 
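+    # The constraints below are passed to tf.image.sample_distorted_bounding_box
+    # and each value is also logged via the MLPerf logger.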
+ min_object_covered = 0.1 + aspect_ratio_range = [0.75, 1.33] + area_range = [0.05, 1.0] + max_attempts = 100 + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV, + value=min_object_covered) + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE, + value=aspect_ratio_range) + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE, + value=area_range) + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS, + value=max_attempts) + + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.image.extract_jpeg_shape(image_buffer), + bounding_boxes=bbox, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + max_attempts=max_attempts, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box + if summary_verbosity >= 3: + image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + image_with_distorted_box = tf.image.draw_bounding_boxes( + tf.expand_dims(image, 0), distort_bbox) + tf.summary.image( + 'images_with_distorted_bounding_box', + image_with_distorted_box) + + # Crop the image to the specified bounding box. + if fuse_decode_and_crop: + offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + image = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=3) + else: + image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') + image = tf.slice(image, bbox_begin, bbox_size) + + mlperf.logger.log(key=mlperf.tags.INPUT_RANDOM_FLIP) + distorted_image = tf.image.random_flip_left_right(image) + + # This resizing operation may distort the images because the aspect + # ratio is not respected. + mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width]) + image_resize_method = get_image_resize_method(resize_method, batch_position) + distorted_image = tf.image.resize_images( + distorted_image, [height, width], + image_resize_method, + align_corners=False) + # Restore the shape since the dynamic slice based upon the bbox_size loses + # the third dimension. + distorted_image.set_shape([height, width, 3]) + if summary_verbosity >= 3: + tf.summary.image('cropped_resized_maybe_flipped_image', + tf.expand_dims(distorted_image, 0)) + + if distortions: + distorted_image = tf.cast(distorted_image, dtype=tf.float32) + # Images values are expected to be in [0,1] for color distortion. + distorted_image /= 255. + # Randomly distort the colors. + distorted_image = distort_color(distorted_image, batch_position, + distort_color_in_yiq=distort_color_in_yiq) + + # Note: This ensures the scaling matches the output of eval_image + distorted_image *= 255 + + if summary_verbosity >= 3: + tf.summary.image( + 'final_distorted_image', + tf.expand_dims(distorted_image, 0)) + return distorted_image + + +def distort_color(image, batch_position=0, distort_color_in_yiq=False, + scope=None): + """Distort the color of the image. + + Each color distortion is non-commutative and thus ordering of the color ops + matters. Ideally we would randomly permute the ordering of the color ops. + Rather then adding that level of complication, we select a distinct ordering + of color ops based on the position of the image in a batch. + + Args: + image: float32 Tensor containing single image. 
Tensor values should be in + range [0, 1]. + batch_position: the position of the image in a batch. NOTE: this argument + can be an integer or a tensor + distort_color_in_yiq: distort color of input images in YIQ space. + scope: Optional scope for op_scope. + Returns: + color-distorted image + """ + if distort_color_in_yiq: + try: + from tensorflow.contrib.image.python.ops import distort_image_ops # pylint: disable=g-import-not-at-top + except ImportError: + raise ValueError( + 'In TF2, you cannot pass --distortions unless you also pass ' + '--nodistort_color_in_yiq. This is because the random_hsv_in_yiq was ' + 'removed in TF2. --distortions does not improve accuracy on resnet ' + 'so it is not recommended. --nodistort_color_in_yiq also has no ' + 'impact on accuracy, but may hurt performance.') + + with tf.name_scope(scope or 'distort_color'): + + def distort_fn_0(image=image): + """Variant 0 of distort function.""" + image = tf.image.random_brightness(image, max_delta=32. / 255.) + if distort_color_in_yiq: + image = distort_image_ops.random_hsv_in_yiq( + image, lower_saturation=0.5, upper_saturation=1.5, + max_delta_hue=0.2 * math.pi) + else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + return image + + def distort_fn_1(image=image): + """Variant 1 of distort function.""" + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + if distort_color_in_yiq: + image = distort_image_ops.random_hsv_in_yiq( + image, lower_saturation=0.5, upper_saturation=1.5, + max_delta_hue=0.2 * math.pi) + else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + return image + + image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0, + distort_fn_1) + # The random_* ops do not necessarily clamp. + image = tf.clip_by_value(image, 0.0, 1.0) + return image + + +class InputPreprocessor(object): + """Base class for all model preprocessors.""" + + def __init__(self, batch_size, output_shapes): + self.batch_size = batch_size + self.output_shapes = output_shapes + + def supports_datasets(self): + """Whether this preprocessor supports dataset.""" + return False + + def minibatch(self, dataset, subset, params, shift_ratio=-1): + """Returns tensors representing a minibatch of all the input.""" + raise NotImplementedError('Must be implemented by subclass.') + + # The methods added below are only supported/used if supports_datasets() + # returns True. + # TODO(laigd): refactor benchmark_cnn.py and put the logic of + # _build_input_processing() into InputPreprocessor. + + def parse_and_preprocess(self, value, batch_position): + """Function to parse and preprocess an Example proto in input pipeline.""" + raise NotImplementedError('Must be implemented by subclass.') + + # TODO(laigd): figure out how to remove these parameters, since the + # preprocessor itself has self.batch_size, self.num_splits, etc defined. 
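To make the batch-position trick in `distort_color()` above concrete, the following self-contained sketch picks one of two distortion orderings from the position's parity and clips back to [0, 1]. It uses a plain `tf.cond` instead of `utils.smart_cond` and only the non-YIQ path, so it is an illustration rather than a drop-in replacement.

```python
import tensorflow.compat.v1 as tf

def toy_distort_color(image, batch_position):
  """image: float32 tensor in [0, 1]; batch_position: int or scalar int tensor."""
  def order_0():
    x = tf.image.random_brightness(image, max_delta=32. / 255.)
    x = tf.image.random_saturation(x, lower=0.5, upper=1.5)
    x = tf.image.random_hue(x, max_delta=0.2)
    return tf.image.random_contrast(x, lower=0.5, upper=1.5)

  def order_1():
    x = tf.image.random_brightness(image, max_delta=32. / 255.)
    x = tf.image.random_contrast(x, lower=0.5, upper=1.5)
    x = tf.image.random_saturation(x, lower=0.5, upper=1.5)
    return tf.image.random_hue(x, max_delta=0.2)

  distorted = tf.cond(tf.equal(batch_position % 2, 0), order_0, order_1)
  # The random_* ops do not necessarily clamp, so clip back to [0, 1].
  return tf.clip_by_value(distorted, 0.0, 1.0)
```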
+ def build_multi_device_iterator(self, batch_size, num_splits, cpu_device, + params, gpu_devices, dataset, doing_eval): + """Creates a MultiDeviceIterator.""" + assert self.supports_datasets() + assert num_splits == len(gpu_devices) + with tf.name_scope('batch_processing'): + if doing_eval: + subset = 'validation' + else: + subset = 'train' + batch_size_per_split = batch_size // num_splits + ds = self.create_dataset( + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train=(not doing_eval), + datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, + num_threads=params.datasets_num_private_threads, + datasets_use_caching=params.datasets_use_caching, + datasets_parallel_interleave_cycle_length=( + params.datasets_parallel_interleave_cycle_length), + datasets_sloppy_parallel_interleave=( + params.datasets_sloppy_parallel_interleave), + datasets_parallel_interleave_prefetch=( + params.datasets_parallel_interleave_prefetch)) + multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator( + ds, + gpu_devices, + source_device=cpu_device, + max_buffer_size=params.multi_device_iterator_max_buffer_size) + tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, + multi_device_iterator.initializer) + return multi_device_iterator + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + raise NotImplementedError('Must be implemented by subclass.') + + def create_iterator(self, ds): + ds_iterator = tf.data.make_initializable_iterator(ds) + tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, + ds_iterator.initializer) + return ds_iterator + + def minibatch_fn(self, batch_size, model_input_shapes, num_splits, + dataset, subset, train, datasets_repeat_cached_sample, + num_threads, datasets_use_caching, + datasets_parallel_interleave_cycle_length, + datasets_sloppy_parallel_interleave, + datasets_parallel_interleave_prefetch): + """Returns a function and list of args for the fn to create a minibatch.""" + assert self.supports_datasets() + batch_size_per_split = batch_size // num_splits + assert batch_size_per_split == model_input_shapes[0][0] + with tf.name_scope('batch_processing'): + ds = self.create_dataset(batch_size, num_splits, batch_size_per_split, + dataset, subset, train, + datasets_repeat_cached_sample, num_threads, + datasets_use_caching, + datasets_parallel_interleave_cycle_length, + datasets_sloppy_parallel_interleave, + datasets_parallel_interleave_prefetch) + ds_iterator = self.create_iterator(ds) + + ds_iterator_string_handle = ds_iterator.string_handle() + + @function.Defun(tf.string) + def _fn(h): + remote_iterator = tf.data.Iterator.from_string_handle( + h, ds_iterator.output_types, ds_iterator.output_shapes) + input_list = remote_iterator.get_next() + reshaped_input_list = [ + tf.reshape(input_list[i], shape=model_input_shapes[i]) + for i in range(len(input_list)) + ] + return reshaped_input_list + + return _fn, [ds_iterator_string_handle] + + +class BaseImagePreprocessor(InputPreprocessor): + """Base class for all image model preprocessors.""" + + def __init__(self, + batch_size, + output_shapes, + num_splits, + dtype, + train, + distortions, + resize_method, + shift_ratio=-1, + summary_verbosity=0, + 
distort_color_in_yiq=True, + fuse_decode_and_crop=True, + match_mlperf=False): + super(BaseImagePreprocessor, self).__init__(batch_size, output_shapes) + image_shape = output_shapes[0] + # image_shape is in form (batch_size, height, width, depth) + self.height = image_shape[1] + self.width = image_shape[2] + self.depth = image_shape[3] + self.num_splits = num_splits + self.dtype = dtype + self.train = train + self.resize_method = resize_method + self.shift_ratio = shift_ratio + self.distortions = distortions + self.distort_color_in_yiq = distort_color_in_yiq + self.fuse_decode_and_crop = fuse_decode_and_crop + if self.batch_size % self.num_splits != 0: + raise ValueError( + ('batch_size must be a multiple of num_splits: ' + 'batch_size %d, num_splits: %d') % + (self.batch_size, self.num_splits)) + self.batch_size_per_split = self.batch_size // self.num_splits + self.summary_verbosity = summary_verbosity + self.match_mlperf = match_mlperf + + def parse_and_preprocess(self, value, batch_position): + assert self.supports_datasets() + image_buffer, label_index, bbox, _ = parse_example_proto(value) + if self.match_mlperf: + bbox = tf.zeros((1, 0, 4), dtype=bbox.dtype) + mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=False) + else: + mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=True) + image = self.preprocess(image_buffer, bbox, batch_position) + return (image, label_index) + + def preprocess(self, image_buffer, bbox, batch_position): + raise NotImplementedError('Must be implemented by subclass.') + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + assert self.supports_datasets() + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train) + ds = ds.apply( + tf.data.experimental.parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=datasets_parallel_interleave_cycle_length or 10, + sloppy=datasets_sloppy_parallel_interleave, + prefetch_input_elements=datasets_parallel_interleave_prefetch)) + if datasets_repeat_cached_sample: + # Repeat a single sample element indefinitely to emulate memory-speed IO. 
+ ds = ds.take(1).cache().repeat() + counter = tf.data.Dataset.range(batch_size) + counter = counter.repeat() + ds = tf.data.Dataset.zip((ds, counter)) + ds = ds.prefetch(buffer_size=batch_size) + if datasets_use_caching: + ds = ds.cache() + if train: + buffer_size = 10000 + mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=buffer_size) + ds = ds.apply( + tf.data.experimental.shuffle_and_repeat(buffer_size=buffer_size)) + else: + ds = ds.repeat() + ds = ds.apply( + tf.data.experimental.map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=batch_size_per_split, + num_parallel_batches=num_splits)) + ds = ds.prefetch(buffer_size=num_splits) + if num_threads and threadpool is not None: + ds = threadpool.override_threadpool( + ds, + threadpool.PrivateThreadPool( + num_threads, display_name='input_pipeline_thread_pool')) + return ds + + +class RecordInputImagePreprocessor(BaseImagePreprocessor): + """Preprocessor for images with RecordInput format.""" + + def preprocess(self, image_buffer, bbox, batch_position): + """Preprocessing image_buffer as a function of its batch position.""" + if self.train: + image = train_image(image_buffer, self.height, self.width, bbox, + batch_position, self.resize_method, self.distortions, + None, summary_verbosity=self.summary_verbosity, + distort_color_in_yiq=self.distort_color_in_yiq, + fuse_decode_and_crop=self.fuse_decode_and_crop) + else: + image = tf.image.decode_jpeg( + image_buffer, channels=3, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, batch_position, + self.resize_method, + summary_verbosity=self.summary_verbosity) + # Note: image is now float32 [height,width,3] with range [0, 255] + + # image = tf.cast(image, tf.uint8) # HACK TESTING + + if self.match_mlperf: + mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, + value=_CHANNEL_MEANS) + normalized = image - _CHANNEL_MEANS + else: + normalized = normalized_image(image) + return tf.cast(normalized, self.dtype) + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=-1): + if shift_ratio < 0: + shift_ratio = self.shift_ratio + with tf.name_scope('batch_processing'): + # Build final results per split. + images = [[] for _ in range(self.num_splits)] + labels = [[] for _ in range(self.num_splits)] + if params.use_datasets: + ds = self.create_dataset( + self.batch_size, self.num_splits, self.batch_size_per_split, + dataset, subset, self.train, + datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, + num_threads=params.datasets_num_private_threads, + datasets_use_caching=params.datasets_use_caching, + datasets_parallel_interleave_cycle_length=( + params.datasets_parallel_interleave_cycle_length), + datasets_sloppy_parallel_interleave=( + params.datasets_sloppy_parallel_interleave), + datasets_parallel_interleave_prefetch=( + params.datasets_parallel_interleave_prefetch)) + ds_iterator = self.create_iterator(ds) + for d in xrange(self.num_splits): + images[d], labels[d] = ds_iterator.get_next() + + # TODO(laigd): consider removing the --use_datasets option, it should + # always use datasets. 
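As a compact reference for the dataset path above, this standalone sketch mirrors the interleave, zip-with-counter, shuffle/repeat, map-and-batch, and prefetch structure of `create_dataset()`. File names, the parse function, and the knob values are made up for illustration; the real pipeline additionally supports caching, cached-sample repetition, and a private threadpool.

```python
import tensorflow.compat.v1 as tf

def toy_parse(serialized, batch_position):
  # Stands in for parse_and_preprocess(); the zipped counter supplies batch_position.
  del batch_position
  features = tf.parse_single_example(
      serialized, {'label': tf.FixedLenFeature([], tf.int64, default_value=0)})
  return features['label']

batch_size, num_splits = 64, 2
file_names = ['train-00000-of-00002', 'train-00001-of-00002']  # hypothetical
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=True)
ds = ds.apply(tf.data.experimental.parallel_interleave(
    tf.data.TFRecordDataset, cycle_length=10, sloppy=True))
counter = tf.data.Dataset.range(batch_size).repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
ds = ds.apply(tf.data.experimental.map_and_batch(
    map_func=toy_parse,
    batch_size=batch_size // num_splits,
    num_parallel_batches=num_splits))
ds = ds.prefetch(buffer_size=num_splits)
```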
+ else: + record_input = data_flow_ops.RecordInput( + file_pattern=dataset.tf_record_pattern(subset), + seed=301, + parallelism=64, + buffer_size=10000, + batch_size=self.batch_size, + shift_ratio=shift_ratio, + name='record_input') + records = record_input.get_yield_op() + records = tf.split(records, self.batch_size, 0) + records = [tf.reshape(record, []) for record in records] + for idx in xrange(self.batch_size): + value = records[idx] + (image, label) = self.parse_and_preprocess(value, idx) + split_index = idx % self.num_splits + labels[split_index].append(label) + images[split_index].append(image) + + for split_index in xrange(self.num_splits): + if not params.use_datasets: + images[split_index] = tf.parallel_stack(images[split_index]) + labels[split_index] = tf.concat(labels[split_index], 0) + images[split_index] = tf.reshape( + images[split_index], + shape=[self.batch_size_per_split, self.height, self.width, + self.depth]) + labels[split_index] = tf.reshape(labels[split_index], + [self.batch_size_per_split]) + return images, labels + + def supports_datasets(self): + return True + + +class ImagenetPreprocessor(RecordInputImagePreprocessor): + + def preprocess(self, image_buffer, bbox, batch_position): + # pylint: disable=g-import-not-at-top + try: + from official.r1.resnet.imagenet_preprocessing import preprocess_image + except ImportError: + tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.') + raise + if self.train: + image = preprocess_image( + image_buffer, bbox, self.height, self.width, self.depth, + is_training=True) + else: + image = preprocess_image( + image_buffer, bbox, self.height, self.width, self.depth, + is_training=False) + return tf.cast(image, self.dtype) + + +class Cifar10ImagePreprocessor(BaseImagePreprocessor): + """Preprocessor for Cifar10 input images.""" + + def _distort_image(self, image): + """Distort one image for training a network. + + Adopted the standard data augmentation scheme that is widely used for + this dataset: the images are first zero-padded with 4 pixels on each side, + then randomly cropped to again produce distorted images; half of the images + are then horizontally mirrored. + + Args: + image: input image. + Returns: + distorted image. + """ + image = tf.image.resize_image_with_crop_or_pad( + image, self.height + 8, self.width + 8) + distorted_image = tf.random_crop(image, + [self.height, self.width, self.depth]) + # Randomly flip the image horizontally. 
+ distorted_image = tf.image.random_flip_left_right(distorted_image) + if self.summary_verbosity >= 3: + tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0)) + return distorted_image + + def _eval_image(self, image): + """Get the image for model evaluation.""" + distorted_image = tf.image.resize_image_with_crop_or_pad( + image, self.width, self.height) + if self.summary_verbosity >= 3: + tf.summary.image('cropped.image', tf.expand_dims(distorted_image, 0)) + return distorted_image + + def preprocess(self, raw_image): + """Preprocessing raw image.""" + if self.summary_verbosity >= 3: + tf.summary.image('raw.image', tf.expand_dims(raw_image, 0)) + if self.train and self.distortions: + image = self._distort_image(raw_image) + else: + image = self._eval_image(raw_image) + normalized = normalized_image(image) + return tf.cast(normalized, self.dtype) + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=-1): + # TODO(jsimsa): Implement datasets code path + del shift_ratio, params + with tf.name_scope('batch_processing'): + all_images, all_labels = dataset.read_data_files(subset) + all_images = tf.constant(all_images) + all_labels = tf.constant(all_labels) + input_image, input_label = tf.train.slice_input_producer( + [all_images, all_labels]) + input_image = tf.cast(input_image, self.dtype) + input_label = tf.cast(input_label, tf.int32) + # Ensure that the random shuffling has good mixing properties. + min_fraction_of_examples_in_queue = 0.4 + min_queue_examples = int(dataset.num_examples_per_epoch(subset) * + min_fraction_of_examples_in_queue) + raw_images, raw_labels = tf.train.shuffle_batch( + [input_image, input_label], batch_size=self.batch_size, + capacity=min_queue_examples + 3 * self.batch_size, + min_after_dequeue=min_queue_examples) + + images = [[] for i in range(self.num_splits)] + labels = [[] for i in range(self.num_splits)] + + # Create a list of size batch_size, each containing one image of the + # batch. Without the unstack call, raw_images[i] would still access the + # same image via a strided_slice op, but would be slower. + raw_images = tf.unstack(raw_images, axis=0) + raw_labels = tf.unstack(raw_labels, axis=0) + for i in xrange(self.batch_size): + split_index = i % self.num_splits + # The raw image read from data has the format [depth, height, width] + # reshape to the format returned by minibatch. + raw_image = tf.reshape(raw_images[i], + [dataset.depth, dataset.height, dataset.width]) + raw_image = tf.transpose(raw_image, [1, 2, 0]) + image = self.preprocess(raw_image) + images[split_index].append(image) + + labels[split_index].append(raw_labels[i]) + + for split_index in xrange(self.num_splits): + images[split_index] = tf.parallel_stack(images[split_index]) + labels[split_index] = tf.parallel_stack(labels[split_index]) + return images, labels + + +class COCOPreprocessor(BaseImagePreprocessor): + """Preprocessor for COCO dataset input images, boxes, and labels.""" + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=-1): + del shift_ratio # Not used when using datasets instead of data_flow_ops + with tf.name_scope('batch_processing'): + ds = self.create_dataset( + self.batch_size, self.num_splits, self.batch_size_per_split, + dataset, subset, self.train, params.datasets_repeat_cached_sample) + ds_iterator = self.create_iterator(ds) + + # Training data: 4 tuple + # Validation data: 5 tuple + # See get_input_shapes in models/ssd_model.py for details. 
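The CIFAR-10 augmentation described above for `Cifar10ImagePreprocessor` (pad four pixels per side, random crop, random horizontal flip) can be summarized in a few lines. This is a standalone sketch assuming a 32x32x3 input, using the same tf.compat.v1 ops.

```python
import tensorflow.compat.v1 as tf

def toy_cifar_distort(image, height=32, width=32, depth=3):
  # Zero-pad 4 pixels on each side, then take a random crop of the original size.
  padded = tf.image.resize_image_with_crop_or_pad(image, height + 8, width + 8)
  cropped = tf.random_crop(padded, [height, width, depth])
  # Mirror roughly half of the images.
  return tf.image.random_flip_left_right(cropped)
```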
+ input_len = 4 if subset == 'train' else 5 + input_lists = [[None for _ in range(self.num_splits)] + for _ in range(input_len)] + for d in xrange(self.num_splits): + input_list = ds_iterator.get_next() + for i in range(input_len): + input_lists[i][d] = input_list[i] + return input_lists + + def preprocess(self, data): + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + import ssd_constants # pylint: disable=g-import-not-at-top + from object_detection.core import preprocessor # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation') + image_buffer = data['image_buffer'] + boxes = data['groundtruth_boxes'] + classes = tf.reshape(data['groundtruth_classes'], [-1, 1]) + source_id = tf.string_to_number(data['source_id']) + raw_shape = data['raw_shape'] + + ssd_encoder = ssd_dataloader.Encoder() + + # Only 80 of the 90 COCO classes are used. + class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP) + classes = tf.gather(class_map, classes) + classes = tf.cast(classes, dtype=tf.float32) + + if self.train: + image, boxes, classes = ssd_dataloader.ssd_decode_and_crop( + image_buffer, boxes, classes, raw_shape) + # ssd_crop resizes and returns image of dtype float32 and does not change + # its range (i.e., value in between 0--255). Divide by 255. converts it + # to [0, 1] range. Not doing this before cropping to avoid dtype cast + # (which incurs additional memory copy). + image /= 255. + + image, boxes = preprocessor.random_horizontal_flip( + image=image, boxes=boxes) + # Random horizontal flip probability is 50% + # See https://github.com/tensorflow/models/blob/master/research/object_detection/core/preprocessor.py # pylint: disable=line-too-long + mlperf.logger.log(key=mlperf.tags.RANDOM_FLIP_PROBABILITY, value=0.5) + + image = tf.cast(image, self.dtype) + + encoded_returns = ssd_encoder.encode_labels(boxes, classes) + encoded_classes, encoded_boxes, num_matched_boxes = encoded_returns + + # Shape of image: [width, height, channel] + # Shape of encoded_boxes: [NUM_SSD_BOXES, 4] + # Shape of encoded_classes: [NUM_SSD_BOXES, 1] + # Shape of num_matched_boxes: [1] + return (image, encoded_boxes, encoded_classes, num_matched_boxes) + + else: + image = tf.image.decode_jpeg(image_buffer) + image = tf.image.resize_images( + image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE)) + # resize_image returns image of dtype float32 and does not change its + # range. Divide by 255 to convert image to [0, 1] range. + image /= 255. 
+ + image = ssd_dataloader.normalize_image(image) + image = tf.cast(image, self.dtype) + + def trim_and_pad(inp_tensor): + """Limit the number of boxes, and pad if necessary.""" + inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES] + num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0] + inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]]) + return tf.reshape(inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, + inp_tensor.get_shape()[1]]) + + boxes, classes = trim_and_pad(boxes), trim_and_pad(classes) + + # Shape of boxes: [MAX_NUM_EVAL_BOXES, 4] + # Shape of classes: [MAX_NUM_EVAL_BOXES, 1] + # Shape of source_id: [] (scalar tensor) + # Shape of raw_shape: [3] + return (image, boxes, classes, source_id, raw_shape) + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation') + assert self.supports_datasets() + + glob_pattern = dataset.tf_record_pattern(subset) + ds = tf.data.TFRecordDataset.list_files(glob_pattern, shuffle=train) + # TODO(haoyuzhang): Enable map+filter fusion after cl/218399112 in release + # options = tf.data.Options() + # options.experimental_optimization = tf.data.experimental.OptimizationOptions() # pylint: disable=line-too-long + # options.experimental_optimization.map_and_filter_fusion = True + # ds = ds.with_options(options) + + ds = ds.apply( + tf.data.experimental.parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=datasets_parallel_interleave_cycle_length or 10, + sloppy=datasets_sloppy_parallel_interleave)) + mlperf.logger.log(key=mlperf.tags.INPUT_ORDER) + if datasets_repeat_cached_sample: + # Repeat a single sample element indefinitely to emulate memory-speed IO. + ds = ds.take(1).cache().repeat() + ds = ds.prefetch(buffer_size=batch_size) + if datasets_use_caching: + ds = ds.cache() + if train: + ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000)) + mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=10000) + mlperf.logger.log(key=mlperf.tags.INPUT_ORDER) + else: + ds = ds.repeat() + + ds = ds.map(ssd_dataloader.ssd_parse_example_proto, num_parallel_calls=64) + ds = ds.filter( + lambda data: tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)) + ds = ds.apply( + tf.data.experimental.map_and_batch( + map_func=self.preprocess, + batch_size=batch_size_per_split, + num_parallel_batches=num_splits, + drop_remainder=train)) + ds = ds.prefetch(buffer_size=num_splits) + if num_threads: + ds = threadpool.override_threadpool( + ds, + threadpool.PrivateThreadPool( + num_threads, display_name='input_pipeline_thread_pool')) + return ds + + def supports_datasets(self): + return True + + +class TestImagePreprocessor(BaseImagePreprocessor): + """Preprocessor used for testing. 
+ + set_fake_data() sets which images and labels will be output by minibatch(), + and must be called before minibatch(). This allows tests to easily specify + a set of images to use for training, without having to create any files. + + Queue runners must be started for this preprocessor to work. + """ + + def __init__(self, + batch_size, + output_shapes, + num_splits, + dtype, + train=None, + distortions=None, + resize_method=None, + shift_ratio=0, + summary_verbosity=0, + distort_color_in_yiq=False, + fuse_decode_and_crop=False, + match_mlperf=False): + super(TestImagePreprocessor, self).__init__( + batch_size, output_shapes, num_splits, dtype, train, distortions, + resize_method, shift_ratio, summary_verbosity=summary_verbosity, + distort_color_in_yiq=distort_color_in_yiq, + fuse_decode_and_crop=fuse_decode_and_crop, match_mlperf=match_mlperf) + self.expected_subset = None + + def set_fake_data(self, fake_images, fake_labels): + assert len(fake_images.shape) == 4 + assert len(fake_labels.shape) == 1 + num_images = fake_images.shape[0] + assert num_images == fake_labels.shape[0] + assert num_images % self.batch_size == 0 + self.fake_images = fake_images + self.fake_labels = fake_labels + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=0): + """Get test image batches.""" + del dataset, params + if (not hasattr(self, 'fake_images') or + not hasattr(self, 'fake_labels')): + raise ValueError('Must call set_fake_data() before calling minibatch ' + 'on TestImagePreprocessor') + if self.expected_subset is not None: + assert subset == self.expected_subset + + shift_ratio = shift_ratio or self.shift_ratio + fake_images = cnn_util.roll_numpy_batches(self.fake_images, self.batch_size, + shift_ratio) + fake_labels = cnn_util.roll_numpy_batches(self.fake_labels, self.batch_size, + shift_ratio) + + with tf.name_scope('batch_processing'): + image_slice, label_slice = tf.train.slice_input_producer( + [fake_images, fake_labels], + shuffle=False, + name='image_slice') + raw_images, raw_labels = tf.train.batch( + [image_slice, label_slice], batch_size=self.batch_size, + name='image_batch') + images = [[] for _ in range(self.num_splits)] + labels = [[] for _ in range(self.num_splits)] + for i in xrange(self.batch_size): + split_index = i % self.num_splits + raw_image = tf.cast(raw_images[i], self.dtype) + images[split_index].append(raw_image) + labels[split_index].append(raw_labels[i]) + for split_index in xrange(self.num_splits): + images[split_index] = tf.parallel_stack(images[split_index]) + labels[split_index] = tf.parallel_stack(labels[split_index]) + + normalized = [normalized_image(part) for part in images] + return [[tf.cast(part, self.dtype) for part in normalized], labels] + + +class LibrispeechPreprocessor(InputPreprocessor): + """Preprocessor for librispeech class for all image model preprocessors.""" + + def __init__(self, batch_size, output_shapes, num_splits, dtype, train, + **kwargs): + del kwargs + super(LibrispeechPreprocessor, self).__init__(batch_size, output_shapes) + self.num_splits = num_splits + self.dtype = dtype + self.is_train = train + if self.batch_size % self.num_splits != 0: + raise ValueError(('batch_size must be a multiple of num_splits: ' + 'batch_size %d, num_splits: %d') % (self.batch_size, + self.num_splits)) + self.batch_size_per_split = self.batch_size // self.num_splits + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + 
datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + # TODO(laigd): currently the only difference between this and the one in + # BaseImagePreprocessor is, this uses map() and padded_batch() while the + # latter uses tf.data.experimental.map_and_batch(). Try to merge them. + assert self.supports_datasets() + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train) + ds = ds.apply( + tf.data.experimental.parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=datasets_parallel_interleave_cycle_length or 10, + sloppy=datasets_sloppy_parallel_interleave, + prefetch_input_elements=datasets_parallel_interleave_prefetch)) + if datasets_repeat_cached_sample: + # Repeat a single sample element indefinitely to emulate memory-speed IO. + ds = ds.take(1).cache().repeat() + counter = tf.data.Dataset.range(batch_size) + counter = counter.repeat() + ds = tf.data.Dataset.zip((ds, counter)) + ds = ds.prefetch(buffer_size=batch_size) + if datasets_use_caching: + ds = ds.cache() + if train: + ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000)) + else: + ds = ds.repeat() + ds = ds.map(map_func=self.parse_and_preprocess, + num_parallel_calls=batch_size_per_split*num_splits) + ds = ds.padded_batch( + batch_size=batch_size_per_split, + padded_shapes=tuple([ + tf.TensorShape(output_shape[1:]) + for output_shape in self.output_shapes + ]), + drop_remainder=True) + ds = ds.prefetch(buffer_size=num_splits) + if num_threads: + ds = threadpool.override_threadpool( + ds, + threadpool.PrivateThreadPool( + num_threads, display_name='input_pipeline_thread_pool')) + return ds + + def minibatch(self, dataset, subset, params, shift_ratio=-1): + assert params.use_datasets + # TODO(laigd): unify this with CNNModel's minibatch() + # TODO(laigd): in distributed mode we use shift_ratio so different workers + # won't work on same inputs, so we should respect that. 
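Because utterances have different numbers of frames, the LibriSpeech pipeline above batches with `padded_batch()` rather than `map_and_batch()`. Here is a toy sketch of that pattern using synthetic tensors instead of parsed TFRecords; the lengths and the 161-dimensional feature size are placeholders for illustration.

```python
import tensorflow.compat.v1 as tf

# Three "utterances" with different numbers of frames, 161 features each.
lengths = [5, 3, 7]
ds = tf.data.Dataset.from_tensor_slices(tf.constant(lengths))
ds = ds.map(lambda n: tf.ones([n, 161]))          # variable-length features
ds = ds.padded_batch(
    batch_size=2,
    padded_shapes=tf.TensorShape([None, 161]),    # pad the time dimension
    drop_remainder=True)                          # keep a static batch size
```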
+ del shift_ratio + with tf.name_scope('batch_processing'): + ds = self.create_dataset( + self.batch_size, + self.num_splits, + self.batch_size_per_split, + dataset, + subset, + self.is_train, + datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, + num_threads=params.datasets_num_private_threads, + datasets_use_caching=params.datasets_use_caching, + datasets_parallel_interleave_cycle_length=( + params.datasets_parallel_interleave_cycle_length), + datasets_sloppy_parallel_interleave=( + params.datasets_sloppy_parallel_interleave), + datasets_parallel_interleave_prefetch=( + params.datasets_parallel_interleave_prefetch)) + ds_iterator = self.create_iterator(ds) + + # The four lists are: input spectrogram feature, labels, input lengths, + # label lengths + input_lists = [[None for _ in range(self.num_splits)] for _ in range(4)] + for d in xrange(self.num_splits): + input_list = ds_iterator.get_next() + for i in range(4): + input_lists[i][d] = input_list[i] + + assert self.output_shapes == [ + input_lists[i][0].shape.as_list() for i in range(4) + ] + return tuple(input_lists) + + def supports_datasets(self): + return True + + def parse_and_preprocess(self, value, batch_position): + """Parse an TFRecord.""" + del batch_position + assert self.supports_datasets() + context_features = { + 'labels': tf.VarLenFeature(dtype=tf.int64), + 'input_length': tf.FixedLenFeature([], dtype=tf.int64), + 'label_length': tf.FixedLenFeature([], dtype=tf.int64), + } + sequence_features = { + 'features': tf.FixedLenSequenceFeature([161], dtype=tf.float32) + } + context_parsed, sequence_parsed = tf.parse_single_sequence_example( + serialized=value, + context_features=context_features, + sequence_features=sequence_features, + ) + + return [ + # Input + tf.expand_dims(sequence_parsed['features'], axis=2), + # Label + tf.cast( + tf.reshape( + tf.sparse_tensor_to_dense(context_parsed['labels']), [-1]), + dtype=tf.int32), + # Input length + tf.cast( + tf.reshape(context_parsed['input_length'], [1]), + dtype=tf.int32), + # Label length + tf.cast( + tf.reshape(context_parsed['label_length'], [1]), + dtype=tf.int32), + ] diff --git a/cv/classification/resnet50/tensorflow/run_tests.py b/cv/classification/resnet50/tensorflow/run_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..5b3dcd3276c776a1a585181229fae19e691106e3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_tests.py @@ -0,0 +1,107 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
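For reference, this self-contained sketch builds a toy `tf.train.SequenceExample` in the layout that `LibrispeechPreprocessor.parse_and_preprocess()` above expects (labels and lengths in the context, 161-dimensional frames in the feature lists) and parses it back with the same feature specs. All values are made up.

```python
import tensorflow.compat.v1 as tf

seq_ex = tf.train.SequenceExample()
seq_ex.context.feature['labels'].int64_list.value.extend([3, 7, 7])
seq_ex.context.feature['input_length'].int64_list.value.append(2)
seq_ex.context.feature['label_length'].int64_list.value.append(3)
frames = seq_ex.feature_lists.feature_list['features']
for _ in range(2):
  frames.feature.add().float_list.value.extend([0.0] * 161)

context, sequence = tf.parse_single_sequence_example(
    serialized=tf.constant(seq_ex.SerializeToString()),
    context_features={
        'labels': tf.VarLenFeature(tf.int64),
        'input_length': tf.FixedLenFeature([], tf.int64),
        'label_length': tf.FixedLenFeature([], tf.int64),
    },
    sequence_features={
        'features': tf.FixedLenSequenceFeature([161], tf.float32),
    })
# sequence['features'] has shape [num_frames, 161]; 'labels' comes back sparse.
```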
+# ============================================================================== +"""Runs the tf_cnn_benchmarks tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import unittest + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf + +import all_reduce_benchmark_test +import allreduce_test +import benchmark_cnn_distributed_test +import benchmark_cnn_test +import cnn_util_test +import variable_mgr_util_test +from models import model_config + +# Ideally, we wouldn't need this option, and run both distributed tests and non- +# distributed tests. But, TensorFlow allocates all the GPU memory by default, so +# the non-distributed tests allocate all the GPU memory. The distributed tests +# spawn processes that run TensorFlow, and cannot run if all the GPU memory is +# already allocated. If a non-distributed test is run, then a distributed test +# is run in the same process, the distributed test will fail because there is no +# more GPU memory for the spawned processes to allocate. +absl_flags.DEFINE_boolean('run_distributed_tests', False, + 'If True, run the distributed tests. If False, the' + 'non-distributed tests.') + +absl_flags.DEFINE_boolean('full_tests', False, + 'If True, all distributed or non-distributed tests ' + 'are run, which can take hours. If False, only a ' + 'subset of tests will be run. This subset runs much ' + 'faster and tests almost all the functionality as ' + 'the full set of tests, so it is recommended to keep ' + 'this option set to False.') + +FLAGS = absl_flags.FLAGS + + +def main(_): + loader = unittest.defaultTestLoader + if FLAGS.full_tests: + suite = unittest.TestSuite([ + loader.loadTestsFromModule(allreduce_test), + loader.loadTestsFromModule(cnn_util_test), + loader.loadTestsFromModule(variable_mgr_util_test), + loader.loadTestsFromModule(benchmark_cnn_test), + loader.loadTestsFromModule(all_reduce_benchmark_test), + ]) + if model_config.can_import_contrib: + from models.tf1_only import nasnet_test # pylint: disable=g-import-not-at-top + suite.addTest(loader.loadTestsFromModule(nasnet_test)) + dist_suite = unittest.TestSuite([ + loader.loadTestsFromModule(benchmark_cnn_distributed_test), + ]) + else: + suite = unittest.TestSuite([ + loader.loadTestsFromModule(allreduce_test), + loader.loadTestsFromModule(cnn_util_test), + loader.loadTestsFromModule(all_reduce_benchmark_test), + loader.loadTestsFromModule(variable_mgr_util_test), + loader.loadTestsFromTestCase(benchmark_cnn_test.TestAlexnetModel), + loader.loadTestsFromTestCase(benchmark_cnn_test.TfCnnBenchmarksTest), + loader.loadTestsFromTestCase(benchmark_cnn_test.VariableUpdateTest), + loader.loadTestsFromTestCase( + benchmark_cnn_test.VariableMgrLocalReplicatedTest), + ]) + dist_suite = unittest.TestSuite([ + loader.loadTestsFromNames([ + 'benchmark_cnn_distributed_test.DistributedVariableUpdateTest' + '.testVarUpdateDefault', + + 'benchmark_cnn_distributed_test.TfCnnBenchmarksDistributedTest' + '.testParameterServer', + ]), + ]) + + if FLAGS.run_distributed_tests: + print('Running distributed tests') + result = unittest.TextTestRunner(verbosity=2).run(dist_suite) + else: + print('Running non-distributed tests') + result = unittest.TextTestRunner(verbosity=2).run(suite) + sys.exit(not result.wasSuccessful()) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + app.run(main) diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh 
b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..98639e5c9f5656c7a46bcc5a1f00609c1170a3f9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +################################################# +# Prepare training arguments +################################################# + +i=0 +model="alexnet" +for arg in "$@" +do + if [ $i -eq 0 ]; then + model=$arg + let i++ + continue + fi + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done +echo "## Training model: ${model}" + + +: ${BATCH_SIZE:=32} +# TRAIN_EPOCHS=10 +# optional optimizer: momentum, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/${model}_distributed" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/${model}_distributed + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +################################################# +# Prepare devices +################################################# +devices=$CUDA_VISIBLE_DEVICES +if [ -n "$devices" ]; then + devices=(${devices//,/ }) + num_devices=${#devices[@]} +else + devices=(0 1) + num_devices=2 +fi +echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}" +echo "num_devices: ${num_devices}" + +if [ "${num_devices}" == "1" ]; then + echo "Error: The number of devices must be greater then 1 for distributed training, but got CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}." 
+ exit 0 +fi + +################################################# +# Prepare distributed training arguments +################################################# +worker_hosts="" +i=0 +for device in "${devices[@]}"; +do + if [ "$i" == "0" ]; then + let i++ + continue + fi + let i++ + worker_hosts="${worker_hosts},127.0.0.1:5000${device}" +done +worker_hosts=${worker_hosts#*,} +echo "worker_hosts: ${worker_hosts}" + +################################################# +# Handle CTRL-C +################################################# +trap ctrl_c INT +function ctrl_c() { + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + done + exit 0 +} + +################################################# +# Start distributed training +################################################# + +pid_list=() +last_device=`expr ${num_devices} - 1` +i=0 +for device in "${devices[@]}"; +do + job_name="worker" + if [ "${i}" == "0" ]; then + job_name="ps" + fi + + if [ ${i} -le 1 ]; then + task_index=0 + else + task_index=`expr ${i} - 1` + fi + + if [ "${i}" == "${last_device}" ]; then + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" + else + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" + fi + let i++ + pid_list+=($!) +done + +echo "All subprocess: ${pid_list[*]}" +ctrl_c +exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/run_train_resnet50_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_resnet50_distributed_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..667cf5bb067665ad83e1f9bac95c8c797f6d91b3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_resnet50_distributed_imagenette.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +bash ./run_train_distributed_imagenette.sh resnet50 "$@" +exit $? \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/run_train_resnet50_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_resnet50_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..e65bfffb9851dfebc41ffc2eba38dd033588057e --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_resnet50_imagenette.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=adam +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/resnet50" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/resnet50 + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@" +do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=resnet50 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + +exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/run_train_resnet50_multigpu_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_resnet50_multigpu_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..d2325ad0330446efb41ee6476de78551cc49c4ad --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_resnet50_multigpu_imagenette.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=adam +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/resnet50_multigpu" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/resnet50_multigpu + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@" +do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +source ./get_num_devices.sh + +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=resnet50 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/ssd_constants.py b/cv/classification/resnet50/tensorflow/ssd_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..77fa0149b79f827b4e021afa67aa0e9409620e78 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/ssd_constants.py @@ -0,0 +1,118 @@ +# Copyright 2018 Google. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Central location for all constants related to MLPerf SSD.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# ============================================================================== +# == Model ===================================================================== +# ============================================================================== +IMAGE_SIZE = 300 + +# TODO(taylorrobie): MLPerf uses 80, but COCO documents 90. (RetinaNet uses 90) +# Update(taylorrobie): Labels > 81 show up in the pipeline. This will need to +# be resolved. +NUM_CLASSES = 81 # Including "no class". Not all COCO classes are used. + +# Note: Zero is special. (Background class) CLASS_INV_MAP[0] must be zero. 
+CLASS_INV_MAP = ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, + 88, 89, 90) +_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)} +CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1)) + +NUM_SSD_BOXES = 8732 + +RESNET_DEPTH = 34 + +"""SSD specific""" +MIN_LEVEL = 3 +MAX_LEVEL = 8 + +FEATURE_SIZES = (38, 19, 10, 5, 3, 1) +STEPS = (8, 16, 32, 64, 100, 300) + +# https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py +SCALES = (21, 45, 99, 153, 207, 261, 315) +ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,)) +NUM_DEFAULTS = (4, 6, 6, 6, 4, 4) +NUM_DEFAULTS_BY_LEVEL = {3: 4, 4: 6, 5: 6, 6: 6, 7: 4, 8: 4} +SCALE_XY = 0.1 +SCALE_HW = 0.2 +BOX_CODER_SCALES = (1 / SCALE_XY, 1 / SCALE_XY, 1 / SCALE_HW, 1 / SCALE_HW) +MATCH_THRESHOLD = 0.5 + +# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683 +NORMALIZATION_MEAN = (0.485, 0.456, 0.406) +NORMALIZATION_STD = (0.229, 0.224, 0.225) + +# SSD Cropping +NUM_CROP_PASSES = 50 +CROP_MIN_IOU_CHOICES = (0, 0.1, 0.3, 0.5, 0.7, 0.9) +P_NO_CROP_PER_PASS = 1 / (len(CROP_MIN_IOU_CHOICES) + 1) + +# Hard example mining +NEGS_PER_POSITIVE = 3 + +# Batch normalization +BATCH_NORM_DECAY = 0.997 +BATCH_NORM_EPSILON = 1e-4 + + +# ============================================================================== +# == Optimizer ================================================================= +# ============================================================================== +LEARNING_RATE_SCHEDULE = ( + (0, 1e-3), + (160000, 1e-4), + (200000, 1e-5), +) +MOMENTUM = 0.9 +WEIGHT_DECAY = 5e-4 + + +# ============================================================================== +# == Keys ====================================================================== +# ============================================================================== +BOXES = "boxes" +CLASSES = "classes" +NUM_MATCHED_BOXES = "num_matched_boxes" +IMAGE = "image" +SOURCE_ID = "source_id" +RAW_SHAPE = "raw_shape" +PRED_BOXES = "pred_boxes" +PRED_SCORES = "pred_scores" + + +# ============================================================================== +# == Evaluation ================================================================ +# ============================================================================== + +# Note: This is based on a batch size of 32 +# https://github.com/mlperf/reference/blob/master/single_stage_detector/ssd/train.py#L21-L37 +CHECKPOINT_FREQUENCY = 20000 +MAX_NUM_EVAL_BOXES = 200 +OVERLAP_CRITERIA = 0.5 # Used for nonmax supression +MIN_SCORE = 0.05 # Minimum score to be considered during evaluation. +DUMMY_SCORE = -1e5 # If no boxes are matched. + +ANNOTATION_FILE = "annotations/instances_val2017.json" +COCO_NUM_TRAIN_IMAGES = 118287 +COCO_NUM_VAL_IMAGES = 4952 diff --git a/cv/classification/resnet50/tensorflow/ssd_dataloader.py b/cv/classification/resnet50/tensorflow/ssd_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..907d30903735d5181abbf18b02118a5eec2540ab --- /dev/null +++ b/cv/classification/resnet50/tensorflow/ssd_dataloader.py @@ -0,0 +1,405 @@ +# Copyright 2018 Google. All Rights Reserved. 
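A quick worked example of the `CLASS_MAP`/`CLASS_INV_MAP` inversion defined in ssd_constants.py above; the tuple is truncated here purely for illustration. COCO category ids are sparse, so `CLASS_MAP` sends each used id to a contiguous label and unused ids to -1.

```python
CLASS_INV_MAP = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13)  # truncated for the example
_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)}
CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1))

assert CLASS_MAP[13] == 12   # COCO id 13 becomes contiguous label 12
assert CLASS_MAP[12] == -1   # COCO id 12 is not used
assert CLASS_MAP[0] == 0     # background stays 0
```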
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Data loader and processing.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools as it +import math + +import numpy as np +import tensorflow.compat.v1 as tf + +from object_detection.box_coders import faster_rcnn_box_coder +from object_detection.core import box_list +from object_detection.core import region_similarity_calculator +from object_detection.core import target_assigner +from object_detection.matchers import argmax_matcher +import mlperf +import ssd_constants + + +class DefaultBoxes(object): + """Default bounding boxes for 300x300 5 layer SSD. + + Default bounding boxes generation follows the order of (W, H, anchor_sizes). + Therefore, the tensor converted from DefaultBoxes has a shape of + [anchor_sizes, H, W, 4]. The last dimension is the box coordinates; 'ltrb' + is [ymin, xmin, ymax, xmax] while 'xywh' is [cy, cx, h, w]. + """ + + def __init__(self): + fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS) + + self.default_boxes = [] + # size of feature and number of feature + for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES): + sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE + sk2 = ssd_constants.SCALES[idx+1] / ssd_constants.IMAGE_SIZE + sk3 = math.sqrt(sk1*sk2) + all_sizes = [(sk1, sk1), (sk3, sk3)] + + for alpha in ssd_constants.ASPECT_RATIOS[idx]: + w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha) + all_sizes.append((w, h)) + all_sizes.append((h, w)) + + assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx] + + for w, h in all_sizes: + for i, j in it.product(range(feature_size), repeat=2): + cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx] + box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w)) + self.default_boxes.append(box) + + assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES + + mlperf.logger.log(key=mlperf.tags.FEATURE_SIZES, + value=ssd_constants.FEATURE_SIZES) + mlperf.logger.log(key=mlperf.tags.STEPS, + value=ssd_constants.STEPS) + mlperf.logger.log(key=mlperf.tags.SCALES, + value=ssd_constants.SCALES) + mlperf.logger.log(key=mlperf.tags.ASPECT_RATIOS, + value=ssd_constants.ASPECT_RATIOS) + mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS, + value=ssd_constants.NUM_SSD_BOXES) + + def to_ltrb(cy, cx, h, w): + return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2 + + # For IoU calculation + self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes) + + def __call__(self, order='ltrb'): + if order == 'ltrb': return self.default_boxes_ltrb + if order == 'xywh': return self.default_boxes + + +def calc_iou_tensor(boxes1, boxes2): + """Calculation of IoU based on two boxes tensor. 
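The `DefaultBoxes` construction above can be sanity-checked with a little arithmetic: the per-location default counts and feature-map sizes multiply out to exactly `NUM_SSD_BOXES`, and the scales for the first layer work out as shown in this pure-Python check (constant values copied from ssd_constants.py).

```python
import math

FEATURE_SIZES = (38, 19, 10, 5, 3, 1)
NUM_DEFAULTS = (4, 6, 6, 6, 4, 4)
SCALES = (21, 45, 99, 153, 207, 261, 315)
IMAGE_SIZE = 300

# 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 == 8732
assert sum(f * f * n for f, n in zip(FEATURE_SIZES, NUM_DEFAULTS)) == 8732

# First feature map: base scale 0.07, next scale 0.15, and the extra
# "in-between" square box uses their geometric mean (~0.102).
sk1 = SCALES[0] / IMAGE_SIZE
sk2 = SCALES[1] / IMAGE_SIZE
sk3 = math.sqrt(sk1 * sk2)
```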
+
+  Reference to https://github.com/kuangliu/pytorch-ssd
+
+  Args:
+    boxes1: shape (N, 4), four coordinates of N boxes
+    boxes2: shape (M, 4), four coordinates of M boxes
+  Returns:
+    IoU: shape (N, M), IoU of the i-th box in `boxes1` and j-th box in `boxes2`
+  """
+  b1_left, b1_top, b1_right, b1_bottom = tf.split(boxes1, 4, axis=1)
+  b2_left, b2_top, b2_right, b2_bottom = tf.split(boxes2, 4, axis=1)
+
+  # Shape of intersect_* (N, M)
+  intersect_left = tf.maximum(b1_left, tf.transpose(b2_left))
+  intersect_top = tf.maximum(b1_top, tf.transpose(b2_top))
+  intersect_right = tf.minimum(b1_right, tf.transpose(b2_right))
+  intersect_bottom = tf.minimum(b1_bottom, tf.transpose(b2_bottom))
+
+  boxes1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
+  boxes2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
+
+  intersect = tf.multiply(tf.maximum((intersect_right - intersect_left), 0),
+                          tf.maximum((intersect_bottom - intersect_top), 0))
+  union = boxes1_area + tf.transpose(boxes2_area) - intersect
+  iou = intersect / union
+
+  return iou
+
+
+def ssd_parse_example_proto(example_serialized):
+  """Parses an Example proto containing a training example of an image.
+
+  Each Example proto contains the following fields that we care about:
+
+    image/encoded: tf.string (JPEG-encoded image bytes)
+    image/source_id: tf.string
+    image/height: tf.int64
+    image/width: tf.int64
+    image/object/bbox/xmin: tf.VarLenFeature(tf.float32)
+    image/object/bbox/xmax: tf.VarLenFeature(tf.float32)
+    image/object/bbox/ymin: tf.VarLenFeature(tf.float32)
+    image/object/bbox/ymax: tf.VarLenFeature(tf.float32)
+    image/object/class/label: tf.VarLenFeature(tf.int64)
+    image/object/class/text: tf.VarLenFeature(tf.string)
+
+  A complete decoder can be found in:
+  https://github.com/tensorflow/models/blob/master/research/object_detection/data_decoders/tf_example_decoder.py
+
+  Args:
+    example_serialized: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+
+  Returns:
+    A dictionary with the following key-values:
+      image_buffer: Tensor tf.string containing the contents of a JPEG file.
+      groundtruth_boxes: Tensor tf.float32 of shape [num_boxes, 4], containing
+        coordinates of object bounding boxes.
+      groundtruth_classes: Tensor tf.int64 of shape [num_boxes, 1], containing
+        class labels of objects.
+      source_id: unique image identifier.
+      raw_shape: [height, width, 3].
+ """ + feature_map = { + 'image/encoded': tf.FixedLenFeature( + (), dtype=tf.string, default_value=''), + 'image/source_id': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/height': tf.FixedLenFeature((), tf.int64, default_value=1), + 'image/width': tf.FixedLenFeature((), tf.int64, default_value=1), + 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/class/label': tf.VarLenFeature(dtype=tf.int64), + } + features = tf.parse_single_example(example_serialized, feature_map) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 1) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 1) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 1) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 1) + + image_buffer = features['image/encoded'] + # Bounding box coordinates should be in ltrb order + boxes = tf.concat([ymin, xmin, ymax, xmax], 1) + classes = tf.expand_dims(features['image/object/class/label'].values, 1) + source_id = features['image/source_id'] + raw_shape = tf.stack([features['image/height'], features['image/width'], 3]) + + return {'image_buffer': image_buffer, + 'groundtruth_boxes': boxes, + 'groundtruth_classes': classes, + 'source_id': source_id, + 'raw_shape': raw_shape} + + +def ssd_decode_and_crop(image_buffer, boxes, classes, raw_shape): + """Crop image randomly and decode the cropped region. + + This function will crop an image to meet the following requirements: + 1. height to width ratio between 0.5 and 2; + 2. IoUs of some boxes exceed specified threshold; + 3. At least one box center is in the cropped region. + We defer the jpeg decoding task until after the crop to avoid wasted work. + + Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation + + Args: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + boxes: Tensor tf.float32 of shape [num_boxes, 4], containing coordinates of + object bounding boxes. + classes: Tensor tf.int64 of shape [num_boxes, 1], containing class labels + of objects. + raw_shape: [height, width, 3]. + + Returns: + resized_image: decoded, cropped, and resized image Tensor tf.float32 of + shape [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE, 3], value + range 0--255. + cropped_boxes: box coordinates for objects in the cropped region. + cropped_classes: class labels for objects in the cropped region. 
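+
+  A rough usage sketch (variable names are illustrative only):
+
+    parsed = ssd_parse_example_proto(serialized_example)
+    image, boxes, classes = ssd_decode_and_crop(
+        parsed['image_buffer'], parsed['groundtruth_boxes'],
+        parsed['groundtruth_classes'], parsed['raw_shape'])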
+ """ + + num_boxes = tf.shape(boxes)[0] + + def no_crop_check(): + return (tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32) + < ssd_constants.P_NO_CROP_PER_PASS) + + def no_crop_proposal(): + return ( + tf.ones((), tf.bool), + tf.convert_to_tensor([0, 0, 1, 1], dtype=tf.float32), + tf.ones((num_boxes,), tf.bool), + ) + + def crop_proposal(): + rand_vec = lambda minval, maxval: tf.random_uniform( + shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval, + dtype=tf.float32) + + width, height = rand_vec(0.3, 1), rand_vec(0.3, 1) + left, top = rand_vec(0, 1-width), rand_vec(0, 1-height) + + right = left + width + bottom = top + height + + ltrb = tf.concat([left, top, right, bottom], axis=1) + + min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0] + ious = calc_iou_tensor(ltrb, boxes) + + # discard any bboxes whose center not in the cropped image + xc, yc = [tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :], + (ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)] + + masks = tf.reduce_all(tf.stack([ + tf.greater(xc, tf.tile(left, (1, num_boxes))), + tf.less(xc, tf.tile(right, (1, num_boxes))), + tf.greater(yc, tf.tile(top, (1, num_boxes))), + tf.less(yc, tf.tile(bottom, (1, num_boxes))), + ], axis=2), axis=2) + + # Checks of whether a crop is valid. + valid_aspect = tf.logical_and(tf.less(height/width, 2), + tf.less(width/height, 2)) + valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True) + valid_masks = tf.reduce_any(masks, axis=1, keepdims=True) + + valid_all = tf.cast(tf.reduce_all(tf.concat( + [valid_aspect, valid_ious, valid_masks], axis=1), axis=1), tf.int32) + + # One indexed, as zero is needed for the case of no matches. + index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32) + + # Either one-hot, or zeros if there is no valid crop. + selection = tf.equal(tf.reduce_max(index * valid_all), index) + + use_crop = tf.reduce_any(selection) + output_ltrb = tf.reduce_sum(tf.multiply(ltrb, tf.tile(tf.cast( + selection, tf.float32)[:, tf.newaxis], (1, 4))), axis=0) + output_masks = tf.reduce_any(tf.logical_and(masks, tf.tile( + selection[:, tf.newaxis], (1, num_boxes))), axis=0) + + return use_crop, output_ltrb, output_masks + + def proposal(*args): + return tf.cond( + pred=no_crop_check(), + true_fn=no_crop_proposal, + false_fn=crop_proposal, + ) + + _, crop_bounds, box_masks = tf.while_loop( + cond=lambda x, *_: tf.logical_not(x), + body=proposal, + loop_vars=[tf.zeros((), tf.bool), tf.zeros((4,), tf.float32), tf.zeros((num_boxes,), tf.bool)], + ) + + filtered_boxes = tf.boolean_mask(boxes, box_masks, axis=0) + + mlperf.logger.log(key=mlperf.tags.NUM_CROPPING_ITERATIONS, + value=ssd_constants.NUM_CROP_PASSES) + + # Clip boxes to the cropped region. + filtered_boxes = tf.stack([ + tf.maximum(filtered_boxes[:, 0], crop_bounds[0]), + tf.maximum(filtered_boxes[:, 1], crop_bounds[1]), + tf.minimum(filtered_boxes[:, 2], crop_bounds[2]), + tf.minimum(filtered_boxes[:, 3], crop_bounds[3]), + ], axis=1) + + left = crop_bounds[0] + top = crop_bounds[1] + width = crop_bounds[2] - left + height = crop_bounds[3] - top + + cropped_boxes = tf.stack([ + (filtered_boxes[:, 0] - left) / width, + (filtered_boxes[:, 1] - top) / height, + (filtered_boxes[:, 2] - left) / width, + (filtered_boxes[:, 3] - top) / height, + ], axis=1) + + # crop_window containing integer coordinates of cropped region. A normalized + # coordinate value of y should be mapped to the image coordinate at + # y * (height - 1). 
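+  # tf.image.decode_and_crop_jpeg expects crop_window as an integer
+  # [crop_y, crop_x, crop_height, crop_width] window in pixels, so the
+  # normalized crop bounds are scaled by the raw image shape below.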
+ raw_shape = tf.cast(raw_shape, tf.float32) + crop_window = tf.stack([left * (raw_shape[0] - 1), + top * (raw_shape[1] - 1), + width * raw_shape[0], + height * raw_shape[1]]) + crop_window = tf.cast(crop_window, tf.int32) + + # Fused op only decodes the cropped portion of an image + cropped_image = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=3) + + # Resize converts image dtype from uint8 to float32, without rescaling values. + resized_image = tf.image.resize_images( + cropped_image, [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE]) + mlperf.logger.log(key=mlperf.tags.INPUT_SIZE, + value=ssd_constants.IMAGE_SIZE) + + cropped_classes = tf.boolean_mask(classes, box_masks, axis=0) + + return resized_image, cropped_boxes, cropped_classes + + +def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0): + """Distort the color of the image.""" + with tf.name_scope('distort_color'): + if brightness > 0: + image = tf.image.random_brightness(image, max_delta=brightness) + if contrast > 0: + image = tf.image.random_contrast( + image, lower=1-contrast, upper=1+contrast) + if saturation > 0: + image = tf.image.random_saturation( + image, lower=1-saturation, upper=1+saturation) + if hue > 0: + image = tf.image.random_hue(image, max_delta=hue) + return image + + +def normalize_image(images): + """Normalize image to zero mean and unit variance. + + Args: + images: a tensor representing images, at least 3-D. + Returns: + images normalized by mean and stdev. + """ + data_type = images.dtype + mean = tf.constant(ssd_constants.NORMALIZATION_MEAN, data_type) + std = tf.constant(ssd_constants.NORMALIZATION_STD, data_type) + images = tf.divide(tf.subtract(images, mean), std) + + mlperf.logger.log(key=mlperf.tags.DATA_NORMALIZATION_MEAN, + value=ssd_constants.NORMALIZATION_MEAN) + mlperf.logger.log(key=mlperf.tags.DATA_NORMALIZATION_STD, + value=ssd_constants.NORMALIZATION_STD) + return images + + +class Encoder(object): + """Encoder for SSD boxes and labels.""" + + def __init__(self): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher( + matched_threshold=ssd_constants.MATCH_THRESHOLD, + unmatched_threshold=ssd_constants.MATCH_THRESHOLD, + negatives_lower_than_unmatched=True, + force_match_for_each_row=True) + + box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=ssd_constants.BOX_CODER_SCALES) + + self.default_boxes = DefaultBoxes()('ltrb') + self.default_boxes = box_list.BoxList( + tf.convert_to_tensor(self.default_boxes)) + self.assigner = target_assigner.TargetAssigner( + similarity_calc, matcher, box_coder) + + def encode_labels(self, gt_boxes, gt_labels): + target_boxes = box_list.BoxList(gt_boxes) + encoded_classes, _, encoded_boxes, _, matches = self.assigner.assign( + self.default_boxes, target_boxes, gt_labels) + num_matched_boxes = tf.reduce_sum( + tf.cast(tf.not_equal(matches, -1), tf.float32)) + return encoded_classes, encoded_boxes, num_matched_boxes diff --git a/cv/classification/resnet50/tensorflow/test_data/__init__.py b/cv/classification/resnet50/tensorflow/test_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00000-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00000-of-00008 new file mode 100644 index 
0000000000000000000000000000000000000000..4e65b92a9a5f252f7b1a9d9048e834217f468971 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00000-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00001-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00001-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..1cf1fec734f3d6bfd74a6e38ac7b0f43d24eaaab Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00001-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00002-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00002-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..631ca95c9e17362c498b71979466661ec7ce4be5 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00002-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00003-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00003-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..addfd3424c00e4596de3bfa77751c1fd891164ba Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00003-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00004-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00004-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..32818ec981b1b648ea605e351012c4e58a075454 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00004-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00005-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00005-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..5f4e651519673b3b61726b5a3b0d21a8c962deb5 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00005-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00006-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00006-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..042a71fd169745357111f2f0de84f42e52849b2a Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00006-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00007-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00007-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..a6d9ce627d88ec39344fbd6aae7badd629c5e54c Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00007-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00000-of-00002 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00000-of-00002 new file mode 100644 index 0000000000000000000000000000000000000000..1c7757759bad5f59007b429adb520fdb5eed4068 Binary files /dev/null and 
b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00000-of-00002 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00001-of-00002 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00001-of-00002 new file mode 100644 index 0000000000000000000000000000000000000000..e0f379cab43b5fa46f6f232e93c1deba2548f7a1 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00001-of-00002 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/images/black_image.jpg b/cv/classification/resnet50/tensorflow/test_data/images/black_image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..312873be3bd305bfb5962896ea8ae507ca44b572 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/images/black_image.jpg differ diff --git a/cv/classification/resnet50/tensorflow/test_data/images/white_image.jpg b/cv/classification/resnet50/tensorflow/test_data/images/white_image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ad96f25af79ca0d683642c3dbef1049cc7061f84 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/images/white_image.jpg differ diff --git a/cv/classification/resnet50/tensorflow/test_data/tfrecord_image_generator.py b/cv/classification/resnet50/tensorflow/test_data/tfrecord_image_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..8f0b9102134456fefd7b712c9e1d734c13a0b9e2 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/test_data/tfrecord_image_generator.py @@ -0,0 +1,226 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generate black and white test TFRecords with Example protos. + +Each record within the TFRecord file is a +serialized Example proto. The Example proto contains the following fields: + + image/encoded: string containing JPEG encoded image in RGB colorspace + image/height: integer, image height in pixels + image/width: integer, image width in pixels + image/colorspace: string, specifying the colorspace, always 'RGB' + image/channels: integer, specifying the number of channels, always 3 + image/format: string, specifying the format, always'JPEG' + + image/filename: string containing the basename of the image file + e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG' + image/class/label: integer specifying the index in a classification layer. + The label ranges from [1, 1000] where 0 is not used. + image/class/synset: string specifying the unique ID of the label, + e.g. 'n01440764' + image/class/text: string specifying the human-readable version of the label + e.g. 
'red fox, Vulpes vulpes' + + image/object/bbox/xmin: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/xmax: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/ymin: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/ymax: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/label: integer specifying the index in a classification + layer. The label ranges from [1, 1000] where 0 is not used. Note this is + always identical to the image label. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random + +import numpy as np +import six +import tensorflow.compat.v1 as tf + + +def _int64_feature(value): + """Wrapper for inserting int64 features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + +def _float_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + + +def _bytes_feature(value): + """Wrapper for inserting bytes features into Example proto.""" + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + +def _convert_to_example(filename, image_buffer, label, synset, human, bbox, + height, width): + """Build an Example proto for an example. + + Args: + filename: string, path to an image file, e.g., '/path/to/example.JPG' + image_buffer: bytes, JPEG encoding of RGB image + label: integer, identifier for the ground truth for the network + synset: string, unique WordNet ID specifying the label, e.g., 'n02323233' + human: string, human-readable label, e.g., 'red fox, Vulpes vulpes' + bbox: list of bounding boxes; each box is a list of integers + specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to + the same label as the image label. 
+ height: integer, image height in pixels + width: integer, image width in pixels + Returns: + Example proto + """ + xmin = [] + ymin = [] + xmax = [] + ymax = [] + for b in bbox: + assert len(b) == 4 + # pylint: disable=expression-not-assigned + [l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)] + # pylint: enable=expression-not-assigned + + colorspace = b'RGB' + channels = 3 + image_format = b'JPEG' + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': _int64_feature(height), + 'image/width': _int64_feature(width), + 'image/colorspace': _bytes_feature(colorspace), + 'image/channels': _int64_feature(channels), + 'image/class/label': _int64_feature(label), + 'image/class/synset': _bytes_feature(six.ensure_binary(synset)), + 'image/class/text': _bytes_feature(six.ensure_binary(human)), + 'image/object/bbox/xmin': _float_feature(xmin), + 'image/object/bbox/xmax': _float_feature(xmax), + 'image/object/bbox/ymin': _float_feature(ymin), + 'image/object/bbox/ymax': _float_feature(ymax), + 'image/object/bbox/label': _int64_feature([label] * len(xmin)), + 'image/format': _bytes_feature(image_format), + 'image/filename': _bytes_feature(os.path.basename(six.ensure_binary( + filename))), + 'image/encoded': _bytes_feature(image_buffer)})) + return example + + +class ImageCoder(object): + """Helper class that provides TensorFlow image coding utilities.""" + + def __init__(self): + # Create a single Session to run all image coding calls. + self._sess = tf.Session() + + # Initializes function that converts PNG to JPEG data. + self._image = tf.placeholder(dtype=tf.uint8) + self._encode_jpeg = tf.image.encode_jpeg( + self._image, format='rgb', quality=100) + + def encode_jpeg(self, image): + jpeg_image = self._sess.run(self._encode_jpeg, + feed_dict={self._image: image}) + return jpeg_image + + +def _process_image(coder, name): + """Process a single image file. + + If name is "train", a black image is returned. Otherwise, a white image is + returned. + + Args: + coder: instance of ImageCoder to provide TensorFlow image coding utils. + name: string, unique identifier specifying the data set. + Returns: + image_buffer: bytes, JPEG encoding of RGB image. + height: integer, image height in pixels. + width: integer, image width in pixels. + """ + # Read the image file. + value = 0 if name == 'train' else 255 + height = random.randint(30, 299) + width = random.randint(30, 299) + image = np.full((height, width, 3), value, np.uint8) + + jpeg_data = coder.encode_jpeg(image) + + return jpeg_data, height, width + + +def _process_dataset(output_directory, num_classes, coder, name, num_images, + num_shards): + """Process a complete data set and save it as a TFRecord. + + Args: + output_directory: Where to put outputs. + num_classes: number of classes. + coder: Instance of an ImageCoder. + name: string, unique identifier specifying the data set. + num_images: number of images to generate. + num_shards: integer number of shards to create. 
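+
+  Shard files are named '<name>-<shard>-of-<num_shards>', for example
+  'train-00000-of-00008'.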
+ """ + files_per_shard = num_images // num_shards + for shard in range(num_shards): + output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) + output_file = os.path.join(output_directory, output_filename) + with tf.python_io.TFRecordWriter(output_file) as writer: + for i in range(files_per_shard): + index = shard * files_per_shard + i + image_buffer, height, width = _process_image(coder, name) + + filename = '{}_{}_{}'.format(name, shard, i) + label = index % num_classes + synset = str(index) + human = name + bbox = [[0.1, 0.1, 0.9, 0.9]] + example = _convert_to_example(filename, image_buffer, label, + synset, human, bbox, + height, width) + writer.write(example.SerializeToString()) + + +def write_black_and_white_tfrecord_data( + output_directory, num_classes, num_train_images=512, + num_validation_images=128, train_shards=8, validation_shards=2): + """Writes black and white images in tfrecord format. + + Training images are black and validation images are white. + + Args: + output_directory: Where to put outputs. + num_classes: number of classes. + num_train_images: number of training images to generate. + num_validation_images: number of validation images to generate. + train_shards: integer number of training shards to create. + validation_shards: integer number of validation shards to create. + """ + + coder = ImageCoder() + _process_dataset(output_directory, num_classes, coder, 'validation', + num_validation_images, validation_shards) + _process_dataset(output_directory, num_classes, coder, 'train', + num_train_images, train_shards) diff --git a/cv/classification/resnet50/tensorflow/test_util.py b/cv/classification/resnet50/tensorflow/test_util.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb930a6b1e2fba3285dc2e14cfd0a3fba85ce4b --- /dev/null +++ b/cv/classification/resnet50/tensorflow/test_util.py @@ -0,0 +1,532 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Shared functionality across multiple test files.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from collections import namedtuple +from contextlib import contextmanager +import os + +import numpy as np +import tensorflow.compat.v1 as tf +import benchmark_cnn +import cnn_util +import datasets +import preprocessing +from models import model +from platforms import util as platforms_util +from test_data import tfrecord_image_generator +from tensorflow.core.protobuf import rewriter_config_pb2 # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.platform import test + + +@contextmanager +def monkey_patch(obj, **kwargs): + """Context mgr to monkey patch attributes on an object (such as a module). + + The attributes are patched back to their original value when the context + manager exits. 
+ + For example, to replace benchmark_cnn.get_data_type with an identity function, + do: + + ``` + with monkey_patch(benchmark_cnn, get_data_type=lambda x: x) + loss1 = benchmark_cnn.loss_function(1) # loss1 will be 1 + loss2 = benchmark_cnn.loss_function(params) # Call the original function + ``` + + Args: + obj: The object (which can be a module) to monkey patch attributes on. + **kwargs: Dictionary mapping from attribute name to value that the attribute + will be patched with. + Yields: + Nothing. + """ + old_values = {key: getattr(obj, key) for key in kwargs} + try: + for key, value in kwargs.items(): + setattr(obj, key, value) + yield + finally: + for key, value in old_values.items(): + setattr(obj, key, value) + + +def monkey_patch_base_cluster_manager(): + """Monkey patches get_cluster_manager to return a BaseClusterManager. + + This function replaces platforms_util.get_cluster_manager with a function that + always return a BaseClusterManager. + + This is useful for testing creating a graph in distributed mode, with only a + single process. GrpcClusterManager's constructor blocks until a cluster is set + up, which requires multiple processes to be created. + """ + def get_test_cluster_manager(params, config_proto): + del config_proto + return cnn_util.BaseClusterManager(params) + platforms_util.get_cluster_manager = get_test_cluster_manager + + +def print_and_add_to_list(print_list): + """Returns a function which prints the input, then adds it to print_list.""" + def f(string): + print(string) + print_list.append(string) + return f + + +TrainingOutput = namedtuple('TrainingOutput', + ['loss', 'top_1_accuracy', 'top_5_accuracy']) + + +EvalOutput = namedtuple('EvalOutput', ['top_1_accuracy', 'top_5_accuracy']) + + +def get_training_outputs_from_logs(logs, print_training_accuracy): + """Returns a list of TrainingOutputs by parsing the logs of a training run. + + Args: + logs: A list of strings, each which is a line from the standard output of + tf_cnn_benchmarks from training. Only lines in the form: + 10 images/sec: 14.2 +/- 0.0 (jitter = 0.0) 7.020 + are parsed (the line may also contain the training accuracies). + print_training_accuracy: The value of the param print_training_accuracy. + Returns: + A list of TrainingOutputs. The list has one element per element of logs + that is in the format above. top_1_accuracy and top_5_accuracy are set to -1 + if the line does not contain accuracies. + """ + outputs = [] + for log in logs: + if 'images/sec' in log and '+/-' in log: + parts = log.split() + if print_training_accuracy: + # Example log with training accuracy: + # 10 images/sec: 0.2 +/- 0.0 (jitter = 0.0) 6.908 0.500 1.000 + assert len(parts) == 11 + top_1_acc = float(parts[9]) + top_5_acc = float(parts[10]) + else: + # Example log without training accuracy: + # 10 images/sec: 0.2 +/- 0.0 (jitter = 0.0) 6.908 + assert len(parts) == 9 + top_1_acc = -1 + top_5_acc = -1 + loss = float(parts[8]) + outputs.append(TrainingOutput(loss=loss, top_1_accuracy=top_1_acc, + top_5_accuracy=top_5_acc)) + assert len(outputs) >= 1 + return outputs + + +def get_evaluation_outputs_from_logs(logs): + """Returns the top 1 and 5 accuracies by parsing the logs of an eval run. + + Args: + logs: A list of strings, each which is a line from the standard output of + tf_cnn_benchmarks from evaluation. Only lines in the form: + Accuracy @ 1 = 0.5000 Accuracy @ 5 = 1.0000 [80 examples] + is parsed. + Returns: + A list of EvalOutputs. 
Normally this list only has one EvalOutput, but can + contain multiple if training is done and + --eval_during_training_every_n_steps is specified. + """ + eval_outputs = [] + for log in logs: + if 'Accuracy @ ' in log: + # Example log: + # Accuracy @ 1 = 0.5000 Accuracy @ 5 = 1.0000 [80 examples] + parts = log.split() + assert len(parts) == 12 + top_1_accuracy = float(parts[4]) + top_5_accuracy = float(parts[9]) + eval_outputs.append(EvalOutput(top_1_accuracy, top_5_accuracy)) + assert eval_outputs + return eval_outputs + + +def check_training_outputs_are_reasonable(testcase, training_outputs, + print_training_accuracy, + max_final_loss=10., + previous_final_loss=None): + """Checks the outputs from training a model are reasonable. + + An assert is failed if the outputs are not reasonable. The final top-1 and + top-5 accuracies are asserted to be 1, and so the dataset used to train should + be trivial to learn. For example, the dataset could consist of a black image + with label 0 and a white image with label 1. + + Args: + testcase: A tf.test.TestCase used for assertions. + training_outputs: A list of TrainingOutputs, as returned from + get_training_outputs_from_logs(). + print_training_accuracy: Whether training accuracies were printed and stored + in training_outputs. + max_final_loss: The loss of the final training output is asserted to be at + most this value. + previous_final_loss: If training was resumed from a checkpoint, the loss of + the final step from the previous training run that saved the checkpoint. + """ + if previous_final_loss is not None: + # Ensure the loss hasn't raised significantly from the final loss of the + # previous training run. + testcase.assertLessEqual(training_outputs[0].loss, + previous_final_loss * 1.01) + for output in training_outputs: + testcase.assertLessEqual(output.loss, 100.) + last_output = training_outputs[-1] + if print_training_accuracy: + testcase.assertEqual(last_output.top_1_accuracy, 1.0) + testcase.assertEqual(last_output.top_5_accuracy, 1.0) + if max_final_loss is not None: + testcase.assertLessEqual(last_output.loss, max_final_loss) + + +def train_and_eval(testcase, + run_fn, + params, + check_output_values, + max_final_loss=10., + skip=None): + """Trains a model then evaluates it. + + This function should be used to verify training and evaluating + BenchmarkCNN works without crashing and that it outputs reasonable + values. BenchmarkCNN will be run three times. First, it will train a + model from scratch, saving a checkpoint. Second, it will load the checkpoint + to continue training. Finally, it evaluates based on the loaded checkpoint. + + Args: + testcase: A tf.test.TestCase used for assertions. + run_fn: Must run `BenchmarkCNN` exactly once. BenchmarkCNN is + never used directly, but instead is only run through `run_fn`. `run_fn` + has the signature (run_type, inner_params) -> output_list, where: + * run_type is a string indicating how BenchmarkCNN will be run. + Either 'InitialTraining', 'TrainingFromCheckpoint' or 'Evaluation'. + * inner_params is the params BenchmarkCNN should be run with. + * output_list[i] is a list of lines from the ith worker's stdout. + params: The params BenchmarkCNN will be run with. + Will be passed to `run_fn` slightly modified in order to run with both + training and evaluation. + check_output_values: Whether the outputs of the workers, such as training + accuracy, should be checked to make sure their values are reasonable. + Fails an assert on `testcase` if a check fails. 
+ max_final_loss: The loss of the final training output is asserted to be at + most this value for both training runs. + skip: If 'eval', evaluation is not done. if + 'eval_and_train_from_checkpoint', evaluation and training from a + checkpoint are both not done. + """ + + assert not skip or skip in {'eval', 'eval_and_train_from_checkpoint'} + + # Part 1: Train from scratch. + tf.logging.info('Training model from scratch') + print_training_accuracy = (params.print_training_accuracy or + params.forward_only) + initial_train_logs = run_fn('InitialTraining', params) + testcase.assertGreaterEqual(len(initial_train_logs), 1) + for lines in initial_train_logs: + initial_train_outputs = get_training_outputs_from_logs( + lines, print_training_accuracy) + if params.cross_replica_sync and params.batch_group_size == 1: + testcase.assertEqual(len(initial_train_outputs), params.num_batches) + if check_output_values: + check_training_outputs_are_reasonable(testcase, initial_train_outputs, + print_training_accuracy, + max_final_loss=max_final_loss) + if params.train_dir is not None: + train_dir_entries = set(os.listdir(params.train_dir)) + testcase.assertGreater(len(train_dir_entries), 0) + else: + train_dir_entries = None + + if skip == 'eval_and_train_from_checkpoint': + return + + # Part 2: Train from the loaded checkpoint. + testcase.assertIsNotNone(train_dir_entries) + tf.logging.info('Training model from loaded checkpoint') + # Run for same number of batches as before. + params = params._replace(num_batches=params.num_batches * 2) + train_logs_from_ckpt = run_fn('TrainingFromCheckpoint', params) + testcase.assertGreaterEqual(len(train_logs_from_ckpt), 1) + for lines in train_logs_from_ckpt: + train_outputs_from_ckpt = get_training_outputs_from_logs( + lines, print_training_accuracy) + if params.cross_replica_sync and params.batch_group_size == 1: + testcase.assertEqual(len(train_outputs_from_ckpt), + params.num_batches // 2 - params.num_warmup_batches) + if check_output_values: + check_training_outputs_are_reasonable( + testcase, train_outputs_from_ckpt, print_training_accuracy, + max_final_loss=max_final_loss, + previous_final_loss=initial_train_outputs[-1].loss) + # Ensure a new checkpoint was written out. + testcase.assertNotEqual(train_dir_entries, set(os.listdir(params.train_dir))) + + if skip == 'eval': + return + + # Part 3: Evaluate from the loaded checkpoint. 
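+  # Evaluation below restores num_batches to its original value (it was doubled
+  # for the training-from-checkpoint run) and sets eval=True before running
+  # BenchmarkCNN a final time on the saved checkpoint.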
+ tf.logging.info('Evaluating model from checkpoint') + params = params._replace(num_batches=params.num_batches // 2, eval=True) + eval_logs = run_fn('Evaluation', params) + testcase.assertGreaterEqual(len(eval_logs), 1) + for lines in eval_logs: + eval_outputs = get_evaluation_outputs_from_logs(lines) + assert len(eval_outputs) == 1 + top_1_accuracy, top_5_accuracy = eval_outputs[0] + if check_output_values: + testcase.assertEqual(top_1_accuracy, 1.0) + testcase.assertEqual(top_5_accuracy, 1.0) + + +def get_temp_dir(dir_name): + dir_path = os.path.join(test.get_temp_dir(), dir_name) + os.mkdir(dir_path) + return dir_path + + +def create_black_and_white_images(): + dir_path = get_temp_dir('black_and_white_images') + tfrecord_image_generator.write_black_and_white_tfrecord_data(dir_path, + num_classes=1) + return dir_path + + +def get_params(train_dir_name): + """Returns params that can be used to train.""" + params = benchmark_cnn.make_params( + batch_size=2, + display_every=1, + init_learning_rate=0.005, + model='trivial', + num_batches=20, + num_gpus=2, + num_warmup_batches=5, + optimizer='sgd', + print_training_accuracy=True, + train_dir=get_temp_dir(train_dir_name), + variable_update='parameter_server', + weight_decay=0, + distortions=True, + distort_color_in_yiq=False) + return benchmark_cnn.set_default_param_values_and_env_vars(params) + + +def get_var_update_params(): + """Returns params that are used when testing variable updates.""" + params = benchmark_cnn.make_params( + batch_size=2, + model='test_model', + num_gpus=2, + display_every=1, + num_warmup_batches=0, + num_batches=4, + weight_decay=2 ** -4, + init_learning_rate=2 ** -4, + optimizer='sgd') + return benchmark_cnn.set_default_param_values_and_env_vars(params) + + +def get_fake_var_update_inputs(): + """Returns fake input 1x1 images to use in variable update tests.""" + # BenchmarkCNN divides by 127.5 then subtracts 1.0 from the images, so after + # that, the images will be -1., 0., 1., ..., 14. + return np.resize(127.5 * np.array(range(16)), (16, 1, 1, 1)) + + +def _worker_batches_in_numpy_array(numpy_inputs, batch_size, shift_ratio): + """Yields batches from a numpy array, for a single worker.""" + numpy_inputs = cnn_util.roll_numpy_batches(numpy_inputs, batch_size, + shift_ratio) + i = 0 + total_batches = numpy_inputs.shape[0] + assert total_batches % batch_size == 0 + while True: + yield numpy_inputs[i:i + batch_size, ...] + i = (i + batch_size) % total_batches + + +def manually_compute_losses(numpy_inputs, inputs_placeholder, loss, num_workers, + params): + """Manually compute the losses each worker should report in tf_cnn_benchmarks. + + This function essentially simulates tf_cnn_benchmarks, computing what the loss + of each worker should be. The caller should create a model, that takes in + images from `inputs_placeholder`, a tf.placeholder, and computes `loss`. + + This function, and all ops passed to this function, must be run under a + tf.device('cpu:0') context manager. + + Non-SGD optimizers are not supported with multiple workers. + + Args: + numpy_inputs: A Numpy array to use as the input images. + inputs_placeholder: A tf.placeholder tensor, where input images can be fed + into. + loss: A scalar tensor representing the loss of the model, which is obtained + from the input images in inputs_placeholder. + num_workers: How many workers should be simulated. + params: Params tuple. 
This doesn't have to have information about the + distributed cluster, such as --num_workers, as num_workers is passed in + separately. + + Returns: + A list of list of losses. return_value[i][j] is the loss of the ith worker + after the jth step. + """ + batch_size = params.batch_size * params.num_gpus + assert numpy_inputs.shape[0] % (num_workers * batch_size) == 0 + l2_loss = tf.add_n([tf.nn.l2_loss(x) for x in tf.trainable_variables()]) + total_loss = loss + params.weight_decay * l2_loss + reported_loss = (loss if params.loss_type_to_report == 'base_loss' + else total_loss) + gradient_multiplier = 1 + if params.variable_update in ('replicated', 'distributed_all_reduce'): + # In certain variable updates, tf_cnn_benchmarks add the gradients of the + # GPUs instead of taking their mean, making the gradients effectively + # params.num_gpu times higher. + # TODO(b/62722498): Make all variable updates consistent. + gradient_multiplier = params.num_gpus + + opt = benchmark_cnn.get_optimizer(params, params.init_learning_rate) + grad_vars = opt.compute_gradients( + total_loss, grad_loss=tf.constant(gradient_multiplier, dtype=tf.float32)) + grads = [g for g, _ in grad_vars] + # We apply gradients from a placeholder. That way, we can first compute the + # gradients from each worker, then afterwards apply them one by one by feeding + # them into the placeholder. + placeholder_grad_vars = [(tf.placeholder(g.dtype, g.shape), v) + for g, v in grad_vars] + placeholder_grads = [g for g, _ in placeholder_grad_vars] + apply_grads_op = opt.apply_gradients(placeholder_grad_vars) + + batch_iterators = [_worker_batches_in_numpy_array(numpy_inputs, batch_size, + shift_ratio=i / num_workers) + for i in range(num_workers)] + # Set the GPU count to 0, to avoid taking all the GPU memory. Unfortunately, + # doing so still takes up about ~1GB for some reason. + config = tf.ConfigProto(device_count={'GPU': 0}) + config.graph_options.rewrite_options.pin_to_host_optimization = ( + rewriter_config_pb2.RewriterConfig.OFF) + with tf.Session(config=config) as sess: + sess.run(tf.global_variables_initializer()) + losses = [[] for _ in range(num_workers)] + for i in range(params.num_batches): + computed_grads = [] + for j in range(num_workers): + batch_feed = next(batch_iterators[j]) + batch_feed = batch_feed / 127.5 - 1 + worker_loss, worker_grads = sess.run((reported_loss, grads), + {inputs_placeholder: batch_feed}) + losses[j].append(worker_loss) + computed_grads.append(worker_grads) + for worker_grads in computed_grads: + # TODO(reedwm): With multiple workers, applying the gradients + # sequentially per worker is not equivalent to what tf_cnn_benchmarks + # does when the optmizer is not SGD. Therefore, this currently does not + # work currently when num_workers > 1 and params.optimizer != 'sgd'. + feed_dict = dict(zip(placeholder_grads, worker_grads)) + sess.run(apply_grads_op, feed_dict) + return losses + + +class TestCNNModel(model.CNNModel): + """A simple model used for testing. + + The input is a 1-channel 1x1 image, consisting of a single number. The model + has two scalar variables: A and B, initialized to 1 and 2 respectively. Given + an image x, the loss is defined as: + + loss = x * A * B + """ + + def __init__(self): + super(TestCNNModel, self).__init__( + 'test_cnn_model', image_size=1, batch_size=1, learning_rate=1) + self.depth = 1 + + VAR_A_INITIAL_VALUE = 1. + VAR_B_INITIAL_VALUE = 2. 
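+
+  # With loss = x * A * B and the initial values A=1 and B=2, an input image
+  # holding the value 3 produces a loss of 6 before any gradient updates.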
+ + def add_inference(self, cnn): + # This model only supports 1x1 images with 1 channel + assert cnn.top_layer.shape[1:] == (1, 1, 1) + # Multiply by variable A. + with tf.name_scope('mult_by_var_A'): + cnn.conv(1, 1, 1, 1, 1, use_batch_norm=None, activation=None, bias=None, + kernel_initializer=tf.constant_initializer( + self.VAR_A_INITIAL_VALUE)) + # Multiply by variable B. + with tf.name_scope('mult_by_var_B'): + cnn.conv(1, 1, 1, 1, 1, use_batch_norm=None, activation=None, bias=None, + kernel_initializer=tf.constant_initializer( + self.VAR_B_INITIAL_VALUE)) + with tf.name_scope('reshape_to_scalar'): + cnn.reshape([-1, 1]) + + def skip_final_affine_layer(self): + return True + + def loss_function(self, inputs, build_network_result): + del inputs + return tf.reduce_mean(build_network_result.logits) + + def manually_compute_losses(self, inputs, num_workers, params): + with tf.Graph().as_default(), tf.device('/cpu:0'): + a = tf.Variable(self.VAR_A_INITIAL_VALUE, name='A') + b = tf.Variable(self.VAR_B_INITIAL_VALUE, name='B') + inputs_placeholder = tf.placeholder(tf.float32, + (None, 1, 1, 1), + name='inputs_placeholder') + inputs_reshaped = tf.reshape(inputs_placeholder, (-1, 1)) + loss = self.loss_function( + None, + model.BuildNetworkResult(logits=inputs_reshaped * a * b, + extra_info=None)) + return manually_compute_losses(inputs, inputs_placeholder, loss, + num_workers, params) + + def accuracy_function(self, inputs, logits): + del inputs + # Let the accuracy be the same as the loss function. + return {'top_1_accuracy': logits, 'top_5_accuracy': logits} + + +class TestDataSet(datasets.ImageDataset): + """A Dataset consisting of 1x1 images with a depth of 1.""" + + def __init__(self, height=1, width=1, depth=1): + super(TestDataSet, self).__init__('test_dataset', height=height, + width=width, depth=depth, data_dir=None, + queue_runner_required=True, num_classes=1) + + def num_examples_per_epoch(self, subset='train'): + del subset + return 1 + + def get_input_preprocessor(self, input_preprocessor='default'): + return preprocessing.TestImagePreprocessor + + def use_synthetic_gpu_inputs(self): + return False diff --git a/cv/classification/resnet50/tensorflow/tf_cnn_benchmarks.py b/cv/classification/resnet50/tensorflow/tf_cnn_benchmarks.py new file mode 100644 index 0000000000000000000000000000000000000000..3014ed7a15a9776572be49a7f5cb5b794504914f --- /dev/null +++ b/cv/classification/resnet50/tensorflow/tf_cnn_benchmarks.py @@ -0,0 +1,80 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Benchmark script for TensorFlow. + +See the README for more information. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf +import time + +import benchmark_cnn +import cnn_util +import flags +import mlperf +from cnn_util import log_fn + + +flags.define_flags() +for name in flags.param_specs.keys(): + absl_flags.declare_key_flag(name) + +absl_flags.DEFINE_boolean( + 'ml_perf_compliance_logging', False, + 'Print logs required to be compliant with MLPerf. If set, must clone the ' + 'MLPerf training repo https://github.com/mlperf/training and add ' + 'https://github.com/mlperf/training/tree/master/compliance to the ' + 'PYTHONPATH') + + +def main(positional_arguments): + # Command-line arguments like '--distortions False' are equivalent to + # '--distortions=True False', where False is a positional argument. To prevent + # this from silently running with distortions, we do not allow positional + # arguments. + assert len(positional_arguments) >= 1 + if len(positional_arguments) > 1: + raise ValueError('Received unknown positional arguments: %s' + % positional_arguments[1:]) + + params = benchmark_cnn.make_params_from_flags() + try: + from dltest import show_training_arguments + show_training_arguments(flags.FLAGS) + except: + pass + with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging, + params.model): + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + + tfversion = cnn_util.tensorflow_version_tuple() + log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) + + bench.print_info() + bench.run() + + +if __name__ == '__main__': + time.sleep(5) + tf.disable_v2_behavior() + app.run(main) # Raises error on invalid flags, unlike tf.app.run() diff --git a/cv/classification/resnet50/tensorflow/variable_mgr.py b/cv/classification/resnet50/tensorflow/variable_mgr.py new file mode 100644 index 0000000000000000000000000000000000000000..119b0278c0c0a8ac0f49811267554b3db216ef98 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/variable_mgr.py @@ -0,0 +1,839 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines VariableMgr and subclasses used to manage variables. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import re + +import tensorflow.compat.v1 as tf + +import allreduce +import batch_allreduce +import variable_mgr_util + + +class VariableMgr(object): + """Abstract superclass for class used by BenchmarkCNN to control variables. + + Functions on this class are used to control how variables are created and + managed, and how gradients are computed and applied. 
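+
+  A typical gradient path through a VariableMgr subclass (names here are
+  illustrative, not part of the API) is:
+
+    devices, state = mgr.preprocess_device_grads(device_grads)
+    for i, _ in enumerate(devices):
+      grads = mgr.get_gradients_to_apply(i, state)
+      mgr.append_apply_gradients_ops(state, opt, grads, training_ops,
+                                     loss_scale_params)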
+ """ + + def __init__(self, benchmark_cnn): + self.benchmark_cnn = benchmark_cnn + self.staging_delta_ops = [] + self.use_resource_vars = benchmark_cnn.params.use_resource_vars + + # A variable for automatic loss scaling. + self.grad_has_inf_nan = None + + self._reuse_vars = False + + def each_tower_has_variables(self): + """Returns True if each GPU tower of the model has separate variables.""" + assert False, 'Must be implemented in subclass' + + def supports_staged_vars(self): + """Whether staged variable management is supported.""" + return False + + def create_outer_variable_scope(self, device_num): + """Create the tf.variable_scope around all model graph operations.""" + del device_num # unused by this implementation + assert False, 'Must be implemented in subclass' + + def preprocess_device_grads(self, device_grads): + """Preprocess the device gradients prior to applying them. + + Args: + device_grads: List of lists of (gradient, variable) tuples. + device_grads[t][g] = (gradient, variable), where t is the index of the + tower and g is the index of the gradient-variable pair. + + Returns: a tuple of (apply_gradients_devices, gradient_state). + gradient_state is an opaque structure that should be passed to + get_gradients_to_apply() and append_apply_gradients_ops() (in that order). + apply_gradients_devices is a list of devices where the gradients will be + applied with get_gradients_to_apply() and append_apply_gradients_ops(). + """ + del device_grads # unused by this implementation + assert False, 'Must be implemented in subclass' + + def get_gradients_to_apply(self, device_num, gradient_state): + """Returns the [(gradient, variable)] list to apply for device_num. + + Args: + device_num: indexes into apply_gradients_devices, which was returned by an + earlier call to preprocess_device_grads. + gradient_state: from previous call to apply_gradients_devices. + """ + del device_num, gradient_state # unused by this implementation + assert False, 'Must be implemented in subclass' + + def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops, + loss_scale_params): + """Adds training ops for grads to 'training_ops'. + + + + Args: + gradient_state: from previous call to apply_gradients_devices. + opt: the underlying optimizer + grads: [(grad, var)] to apply + training_ops: list to which to add ops + loss_scale_params: parameters for loss scaling. + """ + del gradient_state # unused by this implementation + + def get_apply_gradients_ops_func(): + """Returns the apply_gradients op.""" + return [opt.apply_gradients(grads)] + + variable_mgr_util.append_gradients_with_loss_scale( + training_ops, get_apply_gradients_ops_func, loss_scale_params, + self.grad_has_inf_nan) + + def get_post_init_ops(self): + """Returns ops that should run post-initialization.""" + return [] + + def get_devices(self): + """Returns devices to use for computation; includes replica selection.""" + assert False, 'Must be implemented in subclass' + + def savable_variables(self): + """Returns a list/dict of savable variables to pass to tf.train.Saver.""" + return tf.global_variables() + + def trainable_variables_on_device(self, + rel_device_num, + abs_device_num, + writable=False): + """Return the set of trainable variables on device. + + Args: + rel_device_num: local worker device index. + abs_device_num: global graph device index. + writable: whether to get a reference to the underlying variable. + + Returns: + The set of trainable variables on the specified device. 
+ """ + del rel_device_num, writable + if self.each_tower_has_variables(): + params = [ + v for v in tf.trainable_variables() + if v.name.startswith('v%s/' % abs_device_num) + ] + else: + params = tf.trainable_variables() + return params + + @contextlib.contextmanager + def reuse_variables(self): + """Context manager that causes variables requested to be reused. + + Variables requested under this context manager must already exist, and will + be reused instead of being created again. This should be used if the + evaluation model is being built after the training model has already been + built. This is because the evaluation model should reuse variables from the + training model. + + Yields: + Nothing. + """ + old_reuse_vars = self._reuse_vars + try: + self._reuse_vars = True + yield + finally: + self._reuse_vars = old_reuse_vars + + +class VariableMgrIndependent(VariableMgr): + """VariableMgr that implements the --independent mode for local jobs. + + Each GPU has its own copy of the variables, and gradients are + not shared between towers. This can be used to check + performance when no data is moved between GPUs. + """ + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + return (self.benchmark_cnn.devices, device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + tower_grad = device_grads[device_num] + + if self.benchmark_cnn.enable_auto_loss_scale and device_num == 0: + # Since we don't aggregate variables in --independent mode, we cannot tell + # if there are NaNs on all GPUs. So we arbitrarily choose to only check + # NaNs on the first GPU. + has_inf_nan_list = [] + for grad, _ in tower_grad: + has_inf_nan_list.append(tf.reduce_all(tf.is_finite(grad))) + self.grad_has_inf_nan = tf.logical_not(tf.reduce_all(has_inf_nan_list)) + + return tower_grad + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +class VariableMgrLocalFetchFromPS(VariableMgr): + """VariableMgr that implements the --parameter_server mode for local jobs. + + Variables are stored on a parameter server. For each step, each tower gets + a copy of the variables from the parameter server, and sends its gradients + to the param server. + """ + + def each_tower_has_variables(self): + return False + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope('v', reuse=bool(device_num) or self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + return ([self.benchmark_cnn.param_server_device], device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + assert device_num == 0 + device_grads = gradient_state + agg_grads, self.grad_has_inf_nan = ( + variable_mgr_util. 
+ aggregate_gradients_using_copy_with_variable_colocation( + device_grads, + use_mean=True, + check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)) + return agg_grads + + def get_devices(self): + raw_devices = self.benchmark_cnn.raw_devices + if self.benchmark_cnn.local_parameter_device_flag == 'gpu': + return [ + variable_mgr_util.ParamServerDeviceSetter(d, raw_devices) + for d in raw_devices + ] + else: + return [ + tf.train.replica_device_setter( + worker_device=d, + ps_device=self.benchmark_cnn.param_server_device, + ps_tasks=1) for d in raw_devices + ] + + +class VariableMgrLocalFetchFromStagedPS(VariableMgrLocalFetchFromPS): + """Implements fetching a local variable through staging buffers. + """ + + def __init__(self, benchmark_cnn): + super(VariableMgrLocalFetchFromStagedPS, self).__init__(benchmark_cnn) + # A data structure to track where the variables are used on each device. + # Indexed by device_num and var_name, each entry stores the "put" and "get" + # ops used for that variable on that device: + # staging_vars_on_devices[device_num][var_name] == (put_op, get_op) + self.staging_vars_on_devices = [ + dict() for _ in self.benchmark_cnn.raw_devices + ] + + def supports_staged_vars(self): + return True + + def create_outer_variable_scope(self, device_num): + self._custom_getter = variable_mgr_util.StagedVariableGetter( + device_num, self.benchmark_cnn.raw_devices, None, self) + return tf.variable_scope( + 'v', reuse=bool(device_num) or self._reuse_vars, + custom_getter=self._custom_getter, use_resource=self.use_resource_vars) + + def trainable_variables_on_device(self, + rel_device_num, + abs_device_num, + writable=False): + return self._custom_getter.trainable_variables_on_device( + rel_device_num, abs_device_num, writable=writable) + + +class VariableMgrLocalReplicated(VariableMgr): + """VariableMgr that implements the --replicated mode for local jobs. + + Each GPU has its own copy of the variables. To apply gradients, + either a local all-reduce algorithm is applied or a regular + cross-device aggregation is used to replicate the combined + gradients to all towers. 
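+
+  Per-tower variables live under the scopes 'v0/', 'v1/', ..., and
+  get_post_init_ops() copies the initialized values from tower 0 to the other
+  towers so that every replica starts from identical weights.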
+ """ + + def __init__(self, benchmark_cnn, all_reduce_spec, + agg_small_grads_max_bytes, agg_small_grads_max_group, + allreduce_merge_scope): + super(VariableMgrLocalReplicated, self).__init__(benchmark_cnn) + if all_reduce_spec: + spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + if len(spec) != 1: + raise ValueError( + 'replicated mode does not support hybrid all-reduce strategies') + self._all_reduce_spec = spec[0] + else: + self._all_reduce_spec = None + self._agg_small_grads_max_bytes = agg_small_grads_max_bytes + self._agg_small_grads_max_group = agg_small_grads_max_group + self._warmup_ops = [] + self._allreduce_merge_scope = allreduce_merge_scope + self._gradient_put_ops = None + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + compact_grads = (self.benchmark_cnn.params.use_fp16 and + self.benchmark_cnn.params.compact_gradient_transfer) + defer_grads = (self.benchmark_cnn.params.variable_consistency == 'relaxed') + + grads_to_reduce = [[g for g, _ in grad_vars] for grad_vars in device_grads] + algorithm = batch_allreduce.algorithm_from_params(self.benchmark_cnn.params) + reduced_grads, self._warmup_ops = algorithm.batch_all_reduce( + grads_to_reduce, self.benchmark_cnn.params.gradient_repacking, + compact_grads, defer_grads, self.benchmark_cnn.params.xla_compile) + if self.benchmark_cnn.enable_auto_loss_scale: + # Check for infs or nans + is_finite_list = [] + with tf.name_scope('check_for_inf_and_nan'): + for tower_grads in reduced_grads: + with tf.colocate_with(tower_grads[0]): + # TODO(tanmingxing): Create fused op that takes in a list of tensors + # as input and returns scalar boolean True if there are any + # infs/nans. + is_finite_list.append(tf.reduce_all( + [tf.reduce_all(tf.is_finite(g)) for g in tower_grads])) + self.grad_has_inf_nan = tf.logical_not(tf.reduce_all(is_finite_list)) + reduced_device_grads = [[ + (g, v) for g, (_, v) in zip(grads, grad_vars) + ] for grads, grad_vars in zip(reduced_grads, device_grads)] + return self.benchmark_cnn.devices, reduced_device_grads + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + return device_grads[device_num] + + def get_post_init_ops(self): + # Copy initialized values for variables on GPU 0 to other GPUs. + global_vars = tf.global_variables() + var_by_name = dict([(v.name, v) for v in global_vars]) + post_init_ops = [] + for v in global_vars: + split_name = v.name.split('/') + # TODO(b/62630508): use more specific prefix than v or v0. + if split_name[0] == 'v0' or not v.name.startswith('v'): + continue + split_name[0] = 'v0' + copy_from = var_by_name['/'.join(split_name)] + post_init_ops.append(v.assign(copy_from.read_value())) + post_init_ops += self._warmup_ops + return post_init_ops + + def savable_variables(self): + """Return the set of variables used for saving/loading the model.""" + params = [] + for v in tf.global_variables(): + split_name = v.name.split('/') + if split_name[0] == 'v0' or not v.name.startswith('v'): + params.append(v) + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +class VariableMgrDistributedAllReduce(VariableMgr): + """VariableMgr that implements the --distributed_all_reduce mode. + + Each GPU has its own copy of the variables. 
To apply gradients, + the specified all-reduce algorithm is used to reduce the gradients + and replicate the final value to all GPUs. + """ + + def __init__(self, benchmark_cnn, all_reduce_spec, job_name, + num_workers, agg_small_grads_max_bytes, + agg_small_grads_max_group, allreduce_merge_scope): + super(VariableMgrDistributedAllReduce, self).__init__(benchmark_cnn) + if not all_reduce_spec: + raise ValueError( + 'distributed_all_reduce requires a non-empty all_reduce_spec') + self._all_reduce_spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + self._all_reduce_device_prefixes = ( + allreduce.build_all_reduce_device_prefixes(job_name, num_workers)) + self._num_workers = num_workers + self._agg_small_grads_max_bytes = agg_small_grads_max_bytes + self._agg_small_grads_max_group = agg_small_grads_max_group + self._allreduce_merge_scope = allreduce_merge_scope + if not self._all_reduce_spec: + raise ValueError('all_reduce_spec must be specified') + self._single_session = True + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + """Create a scope for the named device. + + Args: + device_num: index of device for variable scope. (Note that + device_num spans all processes in cluster since a single global + graph is used.) + + Returns: + the requested variable_scope + """ + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + remaining_grads = device_grads + aggregated_grads = [] + for spec_tuple in self._all_reduce_spec: + if spec_tuple.limit < 0: + this_grads = remaining_grads + remaining_grads = [] + else: + (this_grads, remaining_grads) = allreduce.split_grads_by_size( + spec_tuple.limit, remaining_grads) + if this_grads: + range_agg_grads = allreduce.sum_gradients_all_reduce( + self._single_session, + self._all_reduce_device_prefixes, + this_grads, + self._num_workers, + spec_tuple.alg, + spec_tuple.shards, + self.benchmark_cnn.gpu_indices, + agg_small_grads_max_bytes=self._agg_small_grads_max_bytes, + agg_small_grads_max_group=self._agg_small_grads_max_group, + allreduce_merge_scope=self._allreduce_merge_scope) + if not aggregated_grads: + aggregated_grads = range_agg_grads + else: + assert len(aggregated_grads) == len(range_agg_grads) + for i in range(len(aggregated_grads)): + aggregated_grads[i] += range_agg_grads[i] + assert not remaining_grads + full_device_set = [] + for grads in device_grads: + g, v = grads[0] + del v + full_device_set.append(g.device) + return (full_device_set, aggregated_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + if device_num >= len(device_grads): + raise ValueError('device_num %d exceeds length of device_grads (%d)' % + (device_num, len(device_grads))) + return device_grads[device_num] + + def get_post_init_ops(self): + """Copy initialized values for variables to other devices.""" + global_vars = tf.global_variables() + var_by_name = dict([(v.name, v) for v in global_vars]) + post_init_ops = [] + for v in global_vars: + split_name = v.name.split('/') + # TODO(b/62630508): use more specific prefix than v or v0. 
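+      # Each tower keeps its variables under a 'v<N>' scope; skip the 'v0'
+      # copies (and any non-tower variables) and, for the rest, rewrite the
+      # leading scope to 'v0' to locate the source variable on the first
+      # device to copy from.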
+ if split_name[0] == 'v0' or not v.name.startswith('v'): + continue + split_name[0] = 'v0' + copy_from = var_by_name['/'.join(split_name)] + post_init_ops.append(v.assign(copy_from.read_value())) + return post_init_ops + + def savable_variables(self): + """Return the set of variables used for saving/loading the model.""" + params = [] + for v in tf.global_variables(): + split_name = v.name.split('/') + if split_name[0] == 'v0' or not v.name.startswith('v'): + params.append(v) + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +# TODO(tucker): Merge this mode with DistributedAllReduce. +class VariableMgrCollectiveAllReduce(VariableMgr): + """VariableMgr that implements the --collective_all_reduce mode. + + Each GPU has its own copy of the variables. To apply gradients + the TF native collective all-reduce op is used to reduce the gradients + and replicate the final value to all GPUs. + """ + + def __init__(self, benchmark_cnn, all_reduce_spec, + num_workers, num_gpus, task_id, allreduce_merge_scope): + super(VariableMgrCollectiveAllReduce, self).__init__(benchmark_cnn) + if not all_reduce_spec: + raise ValueError( + 'collective_all_reduce requires a non-empty all_reduce_spec: %s' + % all_reduce_spec) + parsed_spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + # So far we only support a length-1 all_reduce_spec + if len(parsed_spec) > 1 or parsed_spec[0].limit > 0: + raise ValueError( + 'collective_all_reduce requires one single-range all_reduce_spec %s' + % parsed_spec) + self._all_reduce_spec = parsed_spec[0] + if self._all_reduce_spec.alg != 'collective': + raise ValueError( + 'VariableMgrCollectiveAllReduce initialized with non-collective ' + 'all_reduce_spec %s' % self.all_reduce_spec) + self._num_workers = num_workers + self._num_gpus = num_gpus + self._task_id = task_id + self._allreduce_merge_scope = allreduce_merge_scope + self._instance_key_counter = 10000 + self._instance_key_table = dict() + self._single_session = False + # List of prefixes for generating PS devices, unused here. + self._all_reduce_device_prefixes = None + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + """Create a scope for the named device. + + Args: + device_num: index of device for variable scope. 
+ + Returns: + the requested variable_scope + """ + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars) + + def preprocess_device_grads(self, device_grads): + reduced_grads = allreduce.sum_gradients_all_reduce( + self._single_session, + self._all_reduce_device_prefixes, + device_grads, + self._num_workers, + 'collective', + self._all_reduce_spec.shards, + self.benchmark_cnn.gpu_indices, + allreduce_merge_scope=self._allreduce_merge_scope) + assert len(reduced_grads) == len(device_grads) + full_device_set = [] + for grads in device_grads: + g, _ = grads[0] + full_device_set.append(g.device) + return (full_device_set, reduced_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + if device_num >= len(device_grads): + raise ValueError('device_num %d exceeds length of device_grads (%d)' % + (device_num, len(device_grads))) + return device_grads[device_num] + + def _get_instance_key(self, name): + if name not in self._instance_key_table.keys(): + self._instance_key_counter += 1 + self._instance_key_table[name] = self._instance_key_counter + return self._instance_key_table[name] + + def get_post_init_ops(self): + """Broadcast initialized values of variables to other devices. + + Returns: + At task 0 device 0, broadcast_send. + At all other devices and tasks, broadcast_recv. + """ + global_vars = tf.global_variables() + group_size = self._num_workers * self._num_gpus + post_init_ops = [] + # Gather variables into same-var-different-device groups. + vars_by_suffix = dict() + for v in global_vars: + split_name = v.name.split('/') + mo = re.match(r'v(\d+)$', split_name[0]) + if mo: + device_id = int(mo.group(1)) + suffix = '/'.join(split_name[1:]) + if suffix in vars_by_suffix.keys(): + vars_by_suffix[suffix].append(v) + else: + vars_by_suffix[suffix] = [v] + # Generate broadcast ops for each such group. + for suffix in sorted(vars_by_suffix): + vlist = vars_by_suffix[suffix] + assert self._num_gpus == len(vlist) + devices = [v.device for v in vlist] + # NOTE: this key should generate the same value for all tasks + group_key = allreduce.collective_group_key(devices) + group_size = self._num_workers * len(devices) + instance_key = self._get_instance_key(suffix) + for v in vlist: + split_name = v.name.split('/') + mo = re.match(r'v(\d+)$', split_name[0]) + if mo: + device_id = int(mo.group(1)) + if (self._task_id == 0 and device_id == 0): + with tf.device(v.device): + bcast_send = allreduce.broadcast_send( + v, v.shape, v.dtype, group_size, group_key, instance_key) + post_init_ops.append(v.assign(bcast_send)) + else: + with tf.device(v.device): + bcast_recv = allreduce.broadcast_recv( + v.shape, v.dtype, group_size, group_key, instance_key) + post_init_ops.append(v.assign(bcast_recv)) + return post_init_ops + + def savable_variables(self): + """Return the set of variables used for saving/loading the model.""" + params = [] + if self._task_id == 0: + for v in tf.global_variables(): + split_name = v.name.split('/') + if split_name[0] == 'v0' or not v.name.startswith('v'): + params.append(v) + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +class VariableMgrDistributedFetchFromPS(VariableMgr): + """Implements --variable_update=parameter_server mode for distributed jobs. + + Variables are stored on a parameter server. For each step, each tower gets + a copy of the variables from the parameter server, and sends its gradients + to the param server. 
+ """ + + def each_tower_has_variables(self): + return False + + def create_outer_variable_scope(self, device_num): + if self.benchmark_cnn.local_parameter_device_flag == 'gpu': + caching_devices = self.benchmark_cnn.raw_devices + else: + caching_devices = [self.benchmark_cnn.cpu_device] + custom_getter = variable_mgr_util.OverrideCachingDevice( + caching_devices, self.benchmark_cnn.cpu_device, 1024 * 64) + return tf.variable_scope( + 'v', reuse=bool(device_num) or self._reuse_vars, + custom_getter=custom_getter, use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + # Returns (gradient_devices, gradient_state) + return ([self.benchmark_cnn.param_server_device], device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + assert device_num == 0 + agg_grads, self.grad_has_inf_nan = ( + variable_mgr_util.aggregate_gradients_using_copy( + gradient_state, + use_mean=True, + check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)) + return agg_grads + + def get_devices(self): + ps_strategy = variable_mgr_util.GreedyLoadBalancingStrategy( + self.benchmark_cnn.num_ps, variable_mgr_util.byte_size_load_fn) + return [ + tf.train.replica_device_setter( + worker_device=d, + cluster=self.benchmark_cnn.cluster_manager.get_cluster_spec(), + ps_strategy=ps_strategy) for d in self.benchmark_cnn.raw_devices + ] + + +class VariableMgrDistributedFetchFromStagedPS( + VariableMgrDistributedFetchFromPS): + """Extends VariableMgrDistributedFetchFromPS for --staged_vars.""" + + def __init__(self, benchmark_cnn): + super(VariableMgrDistributedFetchFromStagedPS, self).__init__(benchmark_cnn) + self.staging_vars_on_devices = [ + dict() for _ in self.benchmark_cnn.raw_devices + ] + self.staged_vars_on_cpu = {} + + def create_outer_variable_scope(self, device_num): + self._custom_getter = variable_mgr_util.StagedVariableGetter( + device_num, self.benchmark_cnn.raw_devices, + self.benchmark_cnn.cpu_device, self) + return tf.variable_scope( + 'v', reuse=bool(device_num) or self._reuse_vars, + custom_getter=self._custom_getter, use_resource=self.use_resource_vars) + + def supports_staged_vars(self): + return True + + def trainable_variables_on_device(self, + rel_device_num, + abs_device_num, + writable=False): + return self._custom_getter.trainable_variables_on_device( + rel_device_num, abs_device_num, writable=writable) + + +class VariableMgrDistributedReplicated(VariableMgr): + """VariableMgr that implements the --distributed_replicated mode. + + Each GPU has a copy of the variables, and updates its copy after the + parameter servers are all updated with the gradients from all servers. Only + works with cross_replica_sync=true. Unlike 'replicated', does not use nccl + all-reduce for replicating within a server. + """ + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope( + 'v%s' % device_num, reuse=self._reuse_vars, + custom_getter=variable_mgr_util.OverrideToLocalVariableIfNotPsVar(), + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + return ([self.benchmark_cnn.param_server_device], device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state # From 2nd result of preprocess_device_grads. 
+ + avg_grads, self.grad_has_inf_nan = ( + variable_mgr_util.aggregate_gradients_using_copy_with_device_selection( + self.benchmark_cnn, + device_grads, + use_mean=True, + check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)) + + # Make shadow variable on a parameter server for each original trainable + # variable. + for i, (g, v) in enumerate(avg_grads): + my_name = variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/' + v.name + if my_name.endswith(':0'): + my_name = my_name[:-2] + new_v = tf.get_variable( + my_name, + dtype=v.dtype.base_dtype, + initializer=v.initial_value, + trainable=True) + avg_grads[i] = (g, new_v) + return avg_grads + + def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops, + loss_scale_params): + device_grads = gradient_state # From 2nd result of preprocess_device_grads. + + def get_apply_gradients_ops_func(): + """Returns a list of ops for updating gradients.""" + apply_gradients_ops = [] + # For each variable, apply the combined gradients for this server on + # the parameter server, and then wait for all other servers to do this. + for i, (g, v) in enumerate(grads): + apply_gradient_op = opt.apply_gradients([(g, v)]) + barrier = self.benchmark_cnn.add_sync_queues_and_barrier( + 'replicate_variable_%s' % i, [apply_gradient_op]) + with tf.control_dependencies([barrier]): + with tf.device(self.benchmark_cnn.cpu_device): + updated_value = v.read_value() + for my_d in range(len(self.benchmark_cnn.devices)): + apply_gradients_ops.append( + device_grads[my_d][i][1].assign(updated_value)) + return apply_gradients_ops + + variable_mgr_util.append_gradients_with_loss_scale( + training_ops, get_apply_gradients_ops_func, loss_scale_params, + self.grad_has_inf_nan) + + def _strip_port(self, s): + if s.endswith(':0'): + return s[:-2] + return s + + def get_post_init_ops(self): + # Copy initialized variables for variables on the parameter server + # to the local copy of the variable. + + local_vars = tf.local_variables() + local_var_by_name = dict( + [(self._strip_port(v.name), v) for v in local_vars]) + post_init_ops = [] + for v in tf.global_variables(): + if v.name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0/'): + prefix = self._strip_port( + v.name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0'):]) + for i in range(self.benchmark_cnn.num_gpus): + name = 'v%s%s' % (i, prefix) + if name in local_var_by_name: + copy_to = local_var_by_name[name] + post_init_ops.append(copy_to.assign(v.read_value())) + return post_init_ops + + def _remove_shadow_var_prefix_if_present(self, var_name): + if var_name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'): + return var_name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):] + else: + return var_name + + def var_dict_name(self, v): + return self._strip_port(self._remove_shadow_var_prefix_if_present(v.name)) + + def savable_variables(self): + """Returns a list/dict of savable variables to pass to tf.train.Saver.""" + params = {} + for v in tf.global_variables(): + assert (v.name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0/') + or v.name in ('global_step:0', 'loss_scale:0', + 'loss_scale_normal_steps:0')), ( + 'Invalid global variable: %s' % v) + # We store variables in the checkpoint with the shadow variable prefix + # removed so we can evaluate checkpoints in non-distributed replicated + # mode. The checkpoints can also be loaded for training in + # distributed_replicated mode. 
+ name = self._strip_port(self._remove_shadow_var_prefix_if_present(v.name)) + params[name] = v + for v in tf.local_variables(): + # Non-trainable variables, such as batch norm moving averages, do not have + # corresponding global shadow variables, so we add them here. Trainable + # local variables have corresponding global shadow variables, which were + # added in the global variable loop above. + if v.name.startswith('v0/') and v not in tf.trainable_variables(): + params[self._strip_port(v.name)] = v + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices diff --git a/cv/classification/resnet50/tensorflow/variable_mgr_util.py b/cv/classification/resnet50/tensorflow/variable_mgr_util.py new file mode 100644 index 0000000000000000000000000000000000000000..94ce3e4b7c48d49797802f3dfadbaf0d4108d902 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/variable_mgr_util.py @@ -0,0 +1,676 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for VariableMgr.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as pycoll +import operator + +import numpy as np +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.ops import math_ops + + +PS_SHADOW_VAR_PREFIX = 'ps_var' + +AutoLossScaleParams = pycoll.namedtuple( + 'AutoLossScaleParams', + [ + # If true, enable automatic loss scaling. + 'enable_auto_loss_scale', + # The value to scale the loss before computing gradients. + 'loss_scale', + # Number of normal steps with the current `loss_scale`. + 'loss_scale_normal_steps', + # Increase loss scale every n steps. + 'inc_loss_scale_every_n', + # If true, the current worker is chief. The current implementation + # relies on the chief to update loss_scale value, but in future, we + # might change this to ask the parameter server to update loss_scales + # for better performance. + # TODO(tanmingxing): remove this if loss_scale is updated in ps. + 'is_chief', + ]) + + +def get_loss_scale_update_op(loss_scale, loss_scale_normal_steps, + inc_loss_scale_every_n): + """Returns the update op for loss scaling variables. + + We maintain the counter `loss_scale_normal_steps` to count the number of steps + we have been using the current `loss_scale`. In most cases, this function + increments `loss_scale_normal_steps`. However, if `loss_scale_normal_steps` is + greater than the threshold `inc_loss_scale_every_n`, we double `loss_scale` + and reset `loss_scale_normal_steps` to zero. + + This op is only called if the gradients don't have any infs or nans. 
Instead, + if infs or nans occur in the gradients, we immeditately halve `loss_scale` and + reset `loss_scale_normal_steps` to zero. + + Args: + loss_scale: a tf.Variable represneting the loss_scale value. + loss_scale_normal_steps: a tf.Variable representing the number of training + steps that have run since the loss_scale last changed. + inc_loss_scale_every_n: a Python integer threshold. `loss_scale` is + increased every `inc_loss_scale_every_n` steps, unless the gradients have + infs or nans. + + Returns: + An op for updating `loss_scale` and `loss_scale_normal_steps`. + """ + + def increment_loss_scale_normal_steps_func(): + return tf.group(loss_scale_normal_steps.assign_add(1)) + + def increase_loss_scale_func(): + return tf.group( + tf.assign(loss_scale_normal_steps, 0), + tf.assign(loss_scale, loss_scale * 2)) + + # true_fn and false_fn must have the same type. + return tf.cond(loss_scale_normal_steps < inc_loss_scale_every_n, + increment_loss_scale_normal_steps_func, + increase_loss_scale_func) + + +def append_gradients_with_loss_scale(training_ops, get_apply_gradients_ops_func, + loss_scale_params, grad_has_inf_nan): + """Selectively appends gradients update ops with loss scaling. + + Args: + training_ops: a list of training ops to be executed. + get_apply_gradients_ops_func: a function that returns a list of ops for + applying gradients. Here, we must pass a function instead of the actual + list of ops; otherwise, those ops would be executed unconditionally due to + the semantics of tf.cond. + loss_scale_params: An AutoLossScaleParams tuple. + grad_has_inf_nan: Boolean tensor indicating whether the gradients have infs + or nans. + """ + is_chief = loss_scale_params.is_chief + loss_scale = loss_scale_params.loss_scale + loss_scale_normal_steps = loss_scale_params.loss_scale_normal_steps + inc_loss_scale_every_n = loss_scale_params.inc_loss_scale_every_n + enable_auto_loss_scale = loss_scale_params.enable_auto_loss_scale + + if loss_scale is None or not enable_auto_loss_scale or not is_chief: + training_ops.extend(get_apply_gradients_ops_func()) + else: + # If nans/infs occurred, skip applying gradients and instead update + # loss_scale (halve loss_scale and reset loss_scale_normal_steps to zero). + def update_op_if_nan_or_inf(): + """Update loss_scale and discard gradients if nans/infs occurred.""" + return tf.group( + tf.assign(loss_scale, loss_scale / 2.), + tf.assign(loss_scale_normal_steps, 0)) + + # Otherwise, apply gradients, and update loss_scale and + # loss_scale_normal_steps. + def update_op_if_no_nan_or_inf(): + """Apply gradients, and update loss scaling.""" + return tf.group( + get_loss_scale_update_op(loss_scale, loss_scale_normal_steps, + inc_loss_scale_every_n), + *get_apply_gradients_ops_func()) + + # TODO(tanmingxing): Add support for independent and distributed all_reduce. + assert grad_has_inf_nan is not None + update_op = tf.cond( + grad_has_inf_nan, + update_op_if_nan_or_inf, + update_op_if_no_nan_or_inf, + name='cond_if_grad_has_inf_nan' + ) + training_ops.append(update_op) + + +# To be used with custom_getter on tf.get_variable. +class OverrideCachingDevice(object): + """Variable getter which caches variables on the least loaded device. + + Variables smaller than a certain threshold are cached on a single specific + device, as specified in the constructor. All other variables are load balanced + across a pool of devices, by caching each variable on the least loaded device. 
+ + Note that variable creation only happen when building the model graph on the + first device (see how it sets the 'reuse' parameter in + VariableMgr.*.create_outer_variable_scope()). That means, for all other + devices, the variable scope will reuse the variables created before, which + requires that we set the caching_device correctly as otherwise it may not be + able to find the previously created variable and will create a new one. This + requires when building the model graph on different devices, variables with + the same name should have same size. + + TODO(laigd): consider adding tests or verification logic to enforce this, or + refactor it. + """ + + def __init__(self, devices, device_for_small_variables, + small_variable_size_threshold): + self.devices = devices + self.sizes = [0] * len(self.devices) + self.device_for_small_variables = device_for_small_variables + self.small_variable_size_threshold = small_variable_size_threshold + + def __call__(self, getter, *args, **kwargs): + size = tf.TensorShape(kwargs['shape']).num_elements() + if size < self.small_variable_size_threshold: + device_name = self.device_for_small_variables + else: + device_index, _ = min(enumerate(self.sizes), key=operator.itemgetter(1)) + device_name = self.devices[device_index] + self.sizes[device_index] += size + + kwargs['caching_device'] = device_name + var = getter(*args, **kwargs) + return var + + +# To be used with custom_getter on tf.get_variable. Ensures the created variable +# is in LOCAL_VARIABLES and not GLOBAL_VARIBLES collection. +class OverrideToLocalVariableIfNotPsVar(object): + + # args and kwargs come from the custom_getter interface for Tensorflow + # variables, and matches tf.get_variable's signature, with the addition of + # 'getter' at the beginning. + def __call__(self, getter, name, *args, **kwargs): + if name.startswith(PS_SHADOW_VAR_PREFIX): + return getter(*args, **kwargs) + + if 'collections' in kwargs: + collections = kwargs['collections'] + if not collections: + collections = [tf.GraphKeys.GLOBAL_VARIABLES] + else: + collections = collections[:] + collections.remove(tf.GraphKeys.GLOBAL_VARIABLES) + collections.append(tf.GraphKeys.LOCAL_VARIABLES) + kwargs['collections'] = list(collections) + return getter(name, *args, **kwargs) + + +class ParamServerDeviceSetter(object): + """Helper class to assign variables on the least loaded ps-device.""" + + def __init__(self, worker_device, ps_devices): + """Initializer for ParamServerDevicSetter. + + Args: + worker_device: the device to use for computer ops. + ps_devices: a list of device to use for Variable ops. Each variable is + assigned to the least loaded device. + """ + self.ps_devices = ps_devices + self.worker_device = worker_device + self.ps_sizes = [0] * len(self.ps_devices) + + def __call__(self, op): + if op.device: + return op.device + if op.type not in ['Variable', 'VariableV2']: + return self.worker_device + + device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1)) + device_name = self.ps_devices[device_index] + var_size = op.outputs[0].get_shape().num_elements() + self.ps_sizes[device_index] += var_size + + return device_name + + +class StagedModelVariable(object): + """Staging variable wrapper that decouples reads and updates. + + This class represents a variable through a staging buffer. Reads from this + variable directly gets from the staging buffer. Updates are stacked into + another staging buffer, and will be processed later. 
+ """ + + def __init__(self, real_var, var_stage_get, variable_mgr): + """Initializer for the model variables through a staging buffer. + + Args: + real_var: the underlying real variable. + var_stage_get: the read op from the staging buffer. + variable_mgr: the parent variable-manager. + """ + self.real_var = real_var + self.var_stage_get = var_stage_get + self.variable_mgr = variable_mgr + + def _value(self): + """The read access of this variable. The content from the staging buffer.""" + return self.var_stage_get + + def _ref(self): + """Return the underlying variable ref, required by tf.colocate_with.""" + return self.real_var._ref() # pylint: disable=protected-access + + def read_value(self): + """Mimics tf.Variable.read_value().""" + return tf.identity(self.var_stage_get, name='read') + + @property + def dtype(self): + """Return the non-reference dtype.""" + return self.var_stage_get.dtype + + def assign_sub(self, delta, name=None, read_value=True): + """Mimic the updates to the variable. + + Args: + delta: is pushed into a staging buffer and will be pumped later. + name: currently ignored; names of ops and the StagingArea are + computed without using this pass name. + read_value: if True, will return something which evaluates to the new + value of the variable; if False will return the assign op. + Returns: + The actual updates. The colocation constraint will be reapplied. + """ + # This parameter is ignored: the StagingArea only supports setting + # the shared name, not the names of individual ops it uses. + del name + + # colocate_with(None, True) clears the colocation constraints. + # Push the delta into a staging buffer. + with ops.colocate_with(None, True), tf.device(self.var_stage_get.device): + delta_staging_area = data_flow_ops.StagingArea( + [self.var_stage_get.dtype], shapes=[self.var_stage_get.shape]) + delta_put_op = delta_staging_area.put([delta]) + self.variable_mgr.staging_delta_ops.append(delta_put_op) + delta_get_op = delta_staging_area.get()[0] + # Return the actual updates. The colocation constraint will be reapplied. + return self.real_var.assign_sub(delta_get_op, read_value=read_value) + + @staticmethod + # pylint: disable=bad-staticmethod-argument,invalid-name + def _TensorConversionFunction(self, dtype=None, name=None, as_ref=False): + """Utility function for converting a StagedModelVariable to a Tensor.""" + del dtype, name # unused: this function returns the cached ref or value. + if as_ref: + return self._ref() + else: + return self._value() + + +ops.register_tensor_conversion_function( + StagedModelVariable, StagedModelVariable._TensorConversionFunction) # pylint: disable=protected-access + + +class StagedVariableGetter(object): + """A variable getter through staging buffers on devices. + + Instead of a caching device, this getter tracks where the variable is used. + And on each device, it goes through a staging buffer. + """ + + def __init__(self, device_num, devices, cpu_device, variable_mgr): + """Initializer for StagedVariableGetter. + + Args: + device_num: the current device index. + devices: a list of all the devices to build towers. + cpu_device: a cpu_device for this replica. If None, no cpu-caching is + done. + variable_mgr: the parent variable manager. 
+ """ + self.device_num = device_num + self.devices = devices + self.cpu_device = cpu_device + self.variable_mgr = variable_mgr + + def __call__(self, getter, name, *args, **kwargs): + staging_ops = self.variable_mgr.staging_vars_on_devices[self.device_num] + if name in staging_ops: + put_op, get_op = staging_ops[name] + return get_op + real_var = getter(name, *args, **kwargs) + shape = kwargs['shape'] + dtype = kwargs['dtype'] + trainable = kwargs['trainable'] + if self.cpu_device: + with tf.device(self.cpu_device): + # This helps copying the weights from the parameter to this server only + # once. + if name in self.variable_mgr.staged_vars_on_cpu: + cpu_var = self.variable_mgr.staged_vars_on_cpu[name] + else: + cpu_var = tf.identity(real_var) + self.variable_mgr.staged_vars_on_cpu[name] = cpu_var + var_to_stage = cpu_var + else: + var_to_stage = tf.identity(real_var) # de-reference the variable. + + with tf.device(self.devices[self.device_num]): + staging_area = data_flow_ops.StagingArea([dtype], shapes=[shape]) + put_op = staging_area.put([var_to_stage]) + get_op = staging_area.get()[0] + staging_ops[name] = (put_op, get_op) + if trainable: + # For trainable variables, they are managed separatedly through + # apply_gradients. + return get_op + else: + # For other shadow variables, the access is decoupled through a wrapper + # class. + return StagedModelVariable(real_var, get_op, self.variable_mgr) + + def trainable_variables_on_device(self, rel_device_num, abs_device_num, + writable): + """Return the set of trainable variables on the specified device. + + Args: + rel_device_num: local worker device index. + abs_device_num: global graph device index. + writable: whether the returned variables is writable or read-only. + + Returns: + Return the set of trainable variables on the specified device. + """ + del abs_device_num + params_refs = tf.trainable_variables() + if writable: + return params_refs + params = [] + for param in params_refs: + var_name = param.name.split(':')[0] + _, var_get_op = self.variable_mgr.staging_vars_on_devices[rel_device_num][ + var_name] + params.append(var_get_op) + return params + + +def aggregate_gradients_using_copy_with_device_selection( + benchmark_cnn, tower_grads, use_mean, check_inf_nan): + """Aggregate gradients, controlling device for the aggregation. + + Args: + benchmark_cnn: benchmark_cnn class. + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over towers. The inner list is over individual gradients. + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: If true, check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. 
+ """ + if benchmark_cnn.local_parameter_device_flag == 'gpu': + avail_devices = benchmark_cnn.raw_devices + else: + avail_devices = [benchmark_cnn.param_server_device] + agg_grads = [] + has_nan_or_inf_list = [] + for i, single_grads in enumerate(zip(*tower_grads)): + with tf.device(avail_devices[i % len(avail_devices)]): + grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy( + single_grads, use_mean, check_inf_nan) + agg_grads.append(grad_and_var) + has_nan_or_inf_list.append(has_nan_or_inf) + if check_inf_nan: + return agg_grads, tf.reduce_any(has_nan_or_inf_list) + else: + return agg_grads, None + + +def aggregate_gradients_using_copy_with_variable_colocation( + tower_grads, use_mean, check_inf_nan): + """Aggregate gradients, colocating computation with the gradient's variable. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over towers. The inner list is over individual gradients. All variables + of the same gradient across towers must be the same (that is, + tower_grads[x][a][1] == tower_grads[y][a][1] for all indices x, y, and a) + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: If true, check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. + """ + agg_grads = [] + has_nan_or_inf_list = [] + for single_grads in zip(*tower_grads): + # Note that each single_grads looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + var = single_grads[0][1] + + for _, v in single_grads: + assert v == var + + with tf.device(var.device): + grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy( + single_grads, use_mean, check_inf_nan) + agg_grads.append(grad_and_var) + has_nan_or_inf_list.append(has_nan_or_inf) + + if check_inf_nan: + return agg_grads, tf.reduce_any(has_nan_or_inf_list) + else: + return agg_grads, None + + +def aggregate_gradients_using_copy(tower_grads, use_mean, check_inf_nan): + """Calculate the average gradient for each shared variable across all towers. + + Note that this function provides a synchronization point across all towers. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over towers. The inner list is over individual gradients. + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. + """ + agg_grads = [] + has_nan_or_inf_list = [] + + for single_grads in zip(*tower_grads): + grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy( + single_grads, use_mean, check_inf_nan) + agg_grads.append(grad_and_var) + has_nan_or_inf_list.append(has_nan_or_inf) + + if check_inf_nan: + return agg_grads, tf.reduce_any(has_nan_or_inf_list) + else: + return agg_grads, None + + +# The following two functions are copied from +# tensorflow/python/eager/backprop.py. We do not directly use them as they are +# not exported and subject to change at any time. 
+def flatten_nested_indexed_slices(grad): + assert isinstance(grad, ops.IndexedSlices) + if isinstance(grad.values, ops.Tensor): + return grad + else: + assert isinstance(grad.values, ops.IndexedSlices) + g = flatten_nested_indexed_slices(grad.values) + return ops.IndexedSlices(g.values, array_ops.gather(grad.indices, + g.indices), + g.dense_shape) + + +def aggregate_indexed_slices_gradients(grads): + """Aggregates gradients containing `IndexedSlices`s.""" + if len(grads) < 1: + return None + elif len(grads) == 1: + return grads[0] + else: + grads = [g for g in grads if g is not None] + # If any gradient is a `Tensor`, sum them up and return a dense tensor + # object. + if any(isinstance(g, ops.Tensor) for g in grads): + return math_ops.add_n(grads) + + # The following `_as_indexed_slices_list` casts ids of IndexedSlices into + # int64. It is to make sure the inputs of `concat` all have same the data + # type. + grads = math_ops._as_indexed_slices_list(grads) # pylint: disable=protected-access + + grads = [flatten_nested_indexed_slices(x) for x in grads] + # Form IndexedSlices out of the concatenated values and indices. + concat_grad = ops.IndexedSlices( + array_ops.concat([x.values for x in grads], axis=0), + array_ops.concat([x.indices for x in grads], axis=0), + grads[0].dense_shape) + + return concat_grad + + +def aggregate_single_gradient_using_copy(grad_and_vars, use_mean, + check_inf_nan): + """Calculate the average gradient for a shared variable across all towers. + + Note that this function provides a synchronization point across all towers. + + Args: + grad_and_vars: A list or tuple of (gradient, variable) tuples. Each + (gradient, variable) pair within the outer list represents the gradient + of the variable calculated for a single tower, and the number of pairs + equals the number of towers. + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. + """ + grads = [g for g, _ in grad_and_vars] + if any(isinstance(g, tf.IndexedSlices) for g in grads): + # TODO(reedwm): All-reduce IndexedSlices more effectively. + grad = aggregate_indexed_slices_gradients(grads) + else: + grad = tf.add_n(grads) + + if use_mean and len(grads) > 1: + grad = tf.scalar_mul(1.0 / len(grads), grad) + + v = grad_and_vars[0][1] + if check_inf_nan: + with tf.name_scope('check_for_inf_and_nan'): + has_nan_or_inf = tf.logical_not(tf.reduce_all(tf.is_finite(grads))) + return (grad, v), has_nan_or_inf + else: + return (grad, v), None + + +# This class is copied from +# https://github.com/tensorflow/tensorflow/blob/590d6eef7e91a6a7392c8ffffb7b58f2e0c8bc6b/tensorflow/contrib/training/python/training/device_setter.py#L56. +# We copy it since contrib has been removed from TensorFlow. +class GreedyLoadBalancingStrategy(object): + """Returns the least-loaded ps task for op placement. + + The load is calculated by a user-specified load function passed in at + construction. There are no units for load, and the load function is + responsible for providing an internally consistent measure. + + Note that this strategy is very sensitive to the exact order in which + ps ops (typically variables) are created, as it greedily places ops + on the least-loaded ps at the point each op is processed. 
+ + One reasonable heuristic is the `byte_size_load_fn`, which + estimates load as the number of bytes that would be used to store and + transmit the entire variable. More advanced load functions + could consider the difference in access patterns across ops, or trade + off CPU-intensive ops with RAM-intensive ops with network bandwidth. + + This class is intended to be used as a `ps_strategy` in + `tf.compat.v1.train.replica_device_setter`. + """ + + def __init__(self, num_tasks, load_fn): + """Create a new `LoadBalancingStrategy`. + + Args: + num_tasks: Number of ps tasks to cycle among. + load_fn: A callable that takes an `Operation` and returns a + numeric load value for that op. + """ + self._num_tasks = num_tasks + self._load_fn = load_fn + self._ps_loads = np.zeros(num_tasks) + + def __call__(self, op): + """Choose a ps task index for the given `Operation`. + + Args: + op: A `Operation` to be placed on ps. + + Returns: + The next ps task index to use for the `Operation`. Greedily + places the op on the least-loaded ps task so far, as determined + by the load function. + """ + task = np.argmin(self._ps_loads) + self._ps_loads[task] += self._load_fn(op) + return task + + +# This function is copied from +# https://github.com/tensorflow/tensorflow/blob/590d6eef7e91a6a7392c8ffffb7b58f2e0c8bc6b/tensorflow/contrib/training/python/training/device_setter.py#L105. +# We copy it since contrib has been removed from TensorFlow. +def byte_size_load_fn(op): + """Load function that computes the byte size of a single-output `Operation`. + + This is intended to be used with `"Variable"` ops, which have a single + `Tensor` output with the contents of the variable. However, it can also be + used for calculating the size of any op that has a single output. + + Intended to be used with `GreedyLoadBalancingStrategy`. + + Args: + op: An `Operation` with a single output, typically a "Variable" op. + + Returns: + The number of bytes in the output `Tensor`. + + Raises: + ValueError: if `op` does not have a single output, or if the shape of the + single output is not fully-defined. + """ + if len(op.outputs) != 1: + raise ValueError('Op %s must have a single output' % op) + output = op.outputs[0] + elem_size = output.dtype.size + shape = output.get_shape() + if not shape.is_fully_defined(): + # Due to legacy behavior, scalar "Variable" ops have output Tensors that + # have unknown shape when the op is created (and hence passed to this + # load function for placement), even though the scalar shape is set + # explicitly immediately afterward. + shape = tensor_shape.TensorShape(op.get_attr('shape')) + shape.assert_is_fully_defined() + return shape.num_elements() * elem_size + diff --git a/cv/classification/resnet50/tensorflow/variable_mgr_util_test.py b/cv/classification/resnet50/tensorflow/variable_mgr_util_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0915224f9681ab34daee03e01d12852b15d95298 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/variable_mgr_util_test.py @@ -0,0 +1,153 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for variable_mgr_util.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +import variable_mgr_util + + +class VariableMgrUtilTest(tf.test.TestCase): + + def testGetLossScaleUpdateOpTruePath(self): + loss_scale = tf.Variable(4) + # loss_scale_normal_steps >= inc_loss_scale_every_n + loss_scale_normal_steps = tf.Variable(10) + inc_loss_scale_every_n = 10 + update_op = variable_mgr_util.get_loss_scale_update_op( + loss_scale, loss_scale_normal_steps, inc_loss_scale_every_n) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(update_op) + + self.assertEqual(sess.run(loss_scale), 8) + self.assertEqual(sess.run(loss_scale_normal_steps), 0) + + def testGetLossScaleUpdateOpFalsePath(self): + loss_scale = tf.Variable(4) + # loss_scale_normal_steps < inc_loss_scale_every_n + loss_scale_normal_steps = tf.Variable(9) + inc_loss_scale_every_n = 10 + update_op = variable_mgr_util.get_loss_scale_update_op( + loss_scale, loss_scale_normal_steps, inc_loss_scale_every_n) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(update_op) + + self.assertEqual(sess.run(loss_scale), 4) + self.assertEqual(sess.run(loss_scale_normal_steps), 10) + + def testAppendGradientsWithLossScaleWithAutoScaleDisabled(self): + v = tf.Variable(0) + training_ops = [] + get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)] + loss_scale_params = variable_mgr_util.AutoLossScaleParams( + enable_auto_loss_scale=False, # no auto loss scale. 
+        loss_scale=tf.Variable(4),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=True)
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=True)
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 1)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 4)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 10)
+
+  def testAppendGradientsWithLossScaleForNonChiefWorker(self):
+    v = tf.Variable(0)
+    training_ops = []
+    get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
+    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
+        enable_auto_loss_scale=True,
+        loss_scale=tf.Variable(4),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=False)  # Non-chief
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=False)
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 1)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 4)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 10)
+
+  def testAppendGradientsWithLossScaleWithoutNan(self):
+    v = tf.Variable(0)
+    training_ops = []
+    get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
+    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
+        enable_auto_loss_scale=True,
+        loss_scale=tf.Variable(4, dtype=tf.float32),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=True)
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=tf.constant(False))
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 1)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 8)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 0)
+
+  def testAppendGradientsWithLossScaleWithNan(self):
+    v = tf.Variable(0)
+    training_ops = []
+    get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
+    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
+        enable_auto_loss_scale=True,
+        loss_scale=tf.Variable(4, dtype=tf.float32),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=True)
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=tf.constant(True))
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 0)  # Skip updating for v.
+      # Halve loss_scale and reset loss_scale_normal_steps.
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 2)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 0)
+
+
+if __name__ == '__main__':
+  tf.disable_v2_behavior()
+  tf.test.main()
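
The tests above all revolve around the same loss-scaling policy implemented by `get_loss_scale_update_op` and `append_gradients_with_loss_scale`: after `inc_loss_scale_every_n` consecutive clean steps the scale is doubled, and any step whose gradients contain an inf or NaN skips the weight update, halves the scale, and resets the step counter. The sketch below is a minimal, framework-free illustration of that policy (the `update_loss_scale` helper is ours, not part of the patch) and can serve as a mental model when reading the graph-mode implementation.

```python
# Minimal plain-Python sketch of the loss-scaling policy exercised above.
# `update_loss_scale` is an illustrative helper, not part of this patch.

def update_loss_scale(loss_scale, normal_steps, inc_every_n, grad_has_inf_nan):
    """Return the next (loss_scale, normal_steps) pair for one training step."""
    if grad_has_inf_nan:
        # Bad step: the weight update is skipped, the scale is halved, and the
        # counter of consecutive clean steps restarts.
        return loss_scale / 2.0, 0
    if normal_steps < inc_every_n:
        # Clean step, but the threshold has not been reached yet: just count it.
        return loss_scale, normal_steps + 1
    # inc_every_n clean steps reached: double the scale and restart the counter.
    return loss_scale * 2.0, 0


if __name__ == "__main__":
    # Mirrors testGetLossScaleUpdateOpTruePath: 10 clean steps at threshold 10.
    assert update_loss_scale(4.0, 10, 10, grad_has_inf_nan=False) == (8.0, 0)
    # Mirrors testGetLossScaleUpdateOpFalsePath: only 9 clean steps so far.
    assert update_loss_scale(4.0, 9, 10, grad_has_inf_nan=False) == (4.0, 10)
    # Mirrors the inf/NaN test: the scale is halved and the counter reset.
    assert update_loss_scale(4.0, 10, 10, grad_has_inf_nan=True) == (2.0, 0)
    print("loss-scale policy sketch OK")
```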