diff --git a/cv/classification/resnet50/tensorflow/README.md b/cv/classification/resnet50/tensorflow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..66083d9525952a2446fb9c0a4baea5842808d16a
--- /dev/null
+++ b/cv/classification/resnet50/tensorflow/README.md
@@ -0,0 +1,37 @@
+
+## Prepare
+
+### Install packages
+
+```shell
+pip3 install absl-py git+https://github.com/NVIDIA/dllogger#egg=dllogger
+```
+
+### Download datasets
+
+
+Download the ImageNet dataset and convert it to TFRecord format, following
+[ImageNet-to-TFrecord](https://github.com/kmonachopoulos/ImageNet-to-TFrecord) or the
+[TensorFlow Slim instructions](https://github.com/tensorflow/models/tree/master/research/slim#downloading-and-converting-to-tfrecord-format), then store the converted TFRecord files in a directory named `imagenet_tfrecord`.
+
+
+## Training
+
+### Training on a single card
+
+```shell
+bash run_train_resnet50_imagenette.sh
+```
+
+### Training on multiple cards
+
+```shell
+bash run_train_resnet50_multigpu_imagenette.sh
+```
+
+
+## Result
+
+| config | acc | fps |
+| --- | --- | --- |
+| multi_card | 0.9860 | 236.9 |
\ No newline at end of file
diff --git a/cv/classification/resnet50/tensorflow/README_origin.md b/cv/classification/resnet50/tensorflow/README_origin.md
new file mode 100644
index 0000000000000000000000000000000000000000..e7b746487bcf0daad38d4522580a170ac58523f2
--- /dev/null
+++ b/cv/classification/resnet50/tensorflow/README_origin.md
@@ -0,0 +1,88 @@
+# tf_cnn_benchmarks: High performance benchmarks
+
+**Note: tf_cnn_benchmarks is no longer maintained.**
+
+tf_cnn_benchmarks contains TensorFlow 1 implementations of several popular
+convolutional models, and is designed to be as fast as possible.
+tf_cnn_benchmarks supports both running on a single machine or running in
+distributed mode across multiple hosts.
+
+tf_cnn_benchmarks is no longer maintained. Although it will run with TensorFlow
+2, it was written and optimized for TensorFlow 1, and has not been maintained
+since TensorFlow 2 was released. For clean and easy-to-read TensorFlow 2 models,
+please see the [TensorFlow Official
+Models](https://github.com/tensorflow/models/tree/master/official).
+
+## Getting Started
+
+To run ResNet50 with synthetic data without distortions with a single GPU, run
+
+```
+python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
+```
+
+Note that the master branch of tf_cnn_benchmarks occasionally requires the
+latest nightly version of TensorFlow. You can install the nightly version by
+running `pip install tf-nightly-gpu` in a clean environment, or by installing
+TensorFlow from source. We sometimes will create a branch of tf_cnn_benchmarks,
+in the form of cnn_tf_vX.Y_compatible, that is compatible with TensorFlow
+version X.Y. For example, branch
+[cnn_tf_v1.9_compatible](https://github.com/tensorflow/benchmarks/tree/cnn_tf_v1.9_compatible/scripts/tf_cnn_benchmarks)
+works with TensorFlow 1.9. However, as tf_cnn_benchmarks is no longer
+maintained, we will likely no longer create new branches.
+
+Some important flags are
+
+* model: Model to use, e.g. resnet50, inception3, vgg16, and alexnet.
+* num_gpus: Number of GPUs to use.
+* data_dir: Path to data to process. If not set, synthetic data is used. To
+  use Imagenet data use these
+  [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
+  as a starting point.
+* batch_size: Batch size for each GPU.
+* variable_update: The method for managing variables: parameter_server + ,replicated, distributed_replicated, independent +* local_parameter_device: Device to use as parameter server: cpu or gpu. + +To see the full list of flags, run `python tf_cnn_benchmarks.py --help`. + +To run ResNet50 with real data with 8 GPUs, run: + +``` +python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=256 \ +--model=resnet50 --optimizer=momentum --variable_update=replicated \ +--nodistortions --gradient_repacking=8 --num_gpus=8 \ +--num_epochs=90 --weight_decay=1e-4 --data_dir=${DATA_DIR} --use_fp16 \ +--train_dir=${CKPT_DIR} +``` +This will train a ResNet-50 model on ImageNet with 2048 batch size on 8 +GPUs. The model should train to around 76% accuracy. + +## Running the tests + +To run the tests, run + +```bash +pip install portpicker +python run_tests.py && python run_tests.py --run_distributed_tests +``` + +Note the tests require portpicker. + +The command above runs a subset of tests that is both fast and fairly +comprehensive. Alternatively, all the tests can be run, but this will take a +long time: + +```bash +python run_tests.py --full_tests && python run_tests.py --full_tests --run_distributed_tests +``` + +We will run all tests on every PR before merging them, so it is not necessary +to pass `--full_tests` when running tests yourself. + +To run an individual test, such as method `testParameterServer` of test class +`TfCnnBenchmarksTest` of module `benchmark_cnn_test`, run + +```bash +python -m unittest -v benchmark_cnn_test.TfCnnBenchmarksTest.testParameterServer +``` diff --git a/cv/classification/resnet50/tensorflow/all_reduce_benchmark.py b/cv/classification/resnet50/tensorflow/all_reduce_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..155861c099089c59fe3439e6ef18b5e7e48d81ab --- /dev/null +++ b/cv/classification/resnet50/tensorflow/all_reduce_benchmark.py @@ -0,0 +1,290 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks. + +tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is +useful for benchmarking the performance of just this gradient aggregation, +instead of the entire model. All the flags that tf_cnn_benchmarks accepts are +also accepted by this script, although many are silently ignored. + +The number and shapes of the tensors all-reduced are those of the variables of +the model specified by the --model flag. +TODO(reedwm): Allow custom sizes to be specified. 
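+
+Example invocation (values are illustrative; any tf_cnn_benchmarks flag may be
+passed, and --variable_update=replicated is required by this benchmark):
+
+  python all_reduce_benchmark.py --num_gpus=2 --model=resnet50 \
+      --variable_update=replicated --iters_per_step=5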
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import os +import time + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf + +from tensorflow.python.ops import control_flow_ops +import benchmark_cnn +import cnn_util +import flags +from cnn_util import log_fn + + +absl_flags.DEFINE_integer('iters_per_step', 5, + 'Number of iterations to run all-reduce for, per ' + 'step. Every step, a session will be run on a Graph ' + 'that contains this many copies of the all-reduce. ' + 'The copies are run sequentially. Setting this above ' + '1 is useful to lower the overhead of starting the ' + 'session run, running the VariableV2 ops at the ' + 'start of the step, etc.') + + +flags.define_flags() +for name in flags.param_specs.keys(): + absl_flags.declare_key_flag(name) + + +def get_var_shapes(model): + """Returns the list of variable shapes for a tf_cnn_benchmarks Model.""" + with tf.Graph().as_default(): + # The variable shapes do not depend on the batch size. + images = tf.placeholder(tf.float32, model.get_input_shapes('train')[0]) + model.build_network([images]) + return [[int(d) for d in v.shape.dims] for v in tf.trainable_variables()] + + +def all_reduce(all_device_tensors, variable_mgr): + """Performs a single batch all-reduce. + + Args: + all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is + a tensor, where t is the tower the tensor is on and i is the index of + the tensor. + variable_mgr: The VariableMgr to perform the all-reduce. + Returns: + List of list of tensors in the same form as `all_device_tensors`, except the + tensors are aggregated across towers. + """ + tower_grads = [[(g, None) for g in device_tensors] for + device_tensors in all_device_tensors] + _, aggregated_tower_grads = variable_mgr.preprocess_device_grads(tower_grads) + return [ + [g for g, _ in agg_device_tensors] + for agg_device_tensors in aggregated_tower_grads] + + +def build_all_reduce_iterations(all_device_tensors, tower_devices, variable_mgr, + num_iters): + """Builds the all-reduce ops for multiple iterations to aggregate tensors. + + The tensors in `all_device_tensors` are aggregated `num_iters` times. Each + iteration aggregates the results from the previous iteration. The iterations + are run sequentially, so the aggregations for an iteration do not start + running until the previous iteration has completed. Each iteration after the + first is aggregating already-aggregated values, but it does not matter because + we are only aggregating for benchmarking purposes. + + Args: + all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is + a tensor, where t is the tower the tensor is on and i is the index of + the tensor. + tower_devices: A list of device strings. tower_devices[t] is the device + of the tensors in all_device_tensors[t]. + variable_mgr: The VariableMgr to perform the all-reduce. + num_iters: Number of iterations to aggregate tensors for. + Returns: + An op that when run, causes the all-reduce ops to run. + """ + for i in range(num_iters): + with tf.name_scope('iteration_%d' % i): + # Step 1: Do the aggregation. + with tf.name_scope('tensor_aggregation'): + all_device_tensors = all_reduce(all_device_tensors, variable_mgr) + + # Step 2. Create identity ops, to bring the aggregated results back to + # each device. 
+ new_all_device_tensors = [] + for device, device_tensors in zip(tower_devices, all_device_tensors): + with tf.device(device): + new_all_device_tensors.append([ + tf.identity(t, name='identity_after_allreduce') + for t in device_tensors + ]) + all_device_tensors = new_all_device_tensors + + # Step 3. Add control dependencies to delay the next iteration until this + # iteration is complete. To avoid extra overhead, we do not have any + # cross-device control dependencies, which means it's possible for two + # iterations to slightly overlap. + new_all_device_tensors = [] + for device_tensors in all_device_tensors: + new_all_device_tensors.append([ + control_flow_ops.with_dependencies( + device_tensors, t, name='identity_after_dependencies') + for t in device_tensors + ]) + all_device_tensors = new_all_device_tensors + + # To prevent the dependency optimizer from removing every op we created, + # we store the results in variables. + ops_to_run = [] + for device, device_tensors in zip(tower_devices, all_device_tensors): + with tf.device(device): + for t in device_tensors: + # The placeholder initial value is never run. + var = tf.Variable(tf.placeholder(tf.float32, t.shape), collections=[]) + ops_to_run.append(var.assign(t)) + return tf.group(*ops_to_run) + + +def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters): + """Builds the graph for the benchmark. + + Args: + tower_devices: A list of device strings of the devices to run the all-reduce + benchmark on. + tensor_shapes: A list of shapes of the tensors that will be aggregated for + the all-reduce. + variable_mgr: The VariableMgr to perform the all-reduce. + num_iters: Number of iterations to aggregate tensors for. + Returns: + An op that runs the benchmark. + """ + all_device_tensors = [] + for i, tower_device in enumerate(tower_devices): + with tf.device(tower_device): + device_tensors = [] + for j, shape in enumerate(tensor_shapes): + tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32), + name='tensor_%d_on_device_%d' % (j, i)) + device_tensors.append(tensor) + all_device_tensors.append(device_tensors) + + log_fn('Building all-reduce ops') + benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices, + variable_mgr, num_iters) + log_fn('Done building all-reduce ops') + return benchmark_op + + +def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op): + """Runs the graph for the benchmark. + + Args: + benchmark_op: An op that runs the benchmark. + bench_cnn: The BenchmarkCNN where params and other attributes are obtained. + init_ops: A list of ops that are run before `benchmark_op` for + initialization. + dummy_loss_op: Any op. We must pass a loss op to + `benchmark_cnn.benchmark_one_step`, but the result of the op is never + actually used. + """ + config = benchmark_cnn.create_config_proto(bench_cnn.params) + with tf.Session(config=config) as sess: + for op in init_ops: + sess.run(op) + step_train_times = [] + fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op} + log_fn('Running warmup') + for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches): + if i == 0: + log_fn('Running all-reduce ops') + start = time.time() + if i > 0 and i % bench_cnn.params.display_every == 0: + log_fn('Iteration: %d. Average time per step so far: %s' % + (i, (time.time() - start) / i)) + # Call benchmark_one_step instead of directly calling sess.run(...), to + # potentially get a trace file, partitioned graphs, etc. 
+ benchmark_cnn.benchmark_one_step( + sess=sess, + fetches=fetches, + step=i, + # The batch size is only used for the images/sec calculation, which is + # not actually calculated because we pass show_images_per_sec=False. + batch_size=None, + step_train_times=step_train_times, + trace_filename=bench_cnn.trace_filename, + partitioned_graph_file_prefix=( + bench_cnn.params.partitioned_graph_file_prefix), + profiler=None, + image_producer=None, + params=bench_cnn.params, + show_images_per_sec=False) + log_fn('Average time per step: %s' % + ((time.time() - start) / bench_cnn.num_batches)) + + +def run_benchmark(bench_cnn, num_iters): + """Runs the all-reduce benchmark. + + Args: + bench_cnn: The BenchmarkCNN where params, the variable manager, and other + attributes are obtained. + num_iters: Number of iterations to do all-reduce for for. + + Raises: + ValueError: Invalid params of bench_cnn. + """ + if bench_cnn.params.variable_update != 'replicated': + raise ValueError('--variable_update=replicated must be specified to use' + 'the all-reduce benchmark') + if bench_cnn.params.variable_consistency == 'relaxed': + raise ValueError('--variable_consistency=relaxed is not supported') + + benchmark_op = build_graph(bench_cnn.raw_devices, + get_var_shapes(bench_cnn.model), + bench_cnn.variable_mgr, num_iters) + init_ops = [ + tf.global_variables_initializer(), + bench_cnn.variable_mgr.get_post_init_ops() + ] + loss_op = tf.no_op() + + if bench_cnn.graph_file: + path, filename = os.path.split(bench_cnn.graph_file) + as_text = filename.endswith('txt') + log_fn('Writing GraphDef as %s to %s' % ( + 'text' if as_text else 'binary', bench_cnn.graph_file)) + tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True), + path, filename, as_text) + + run_graph(benchmark_op, bench_cnn, init_ops, loss_op) + + +# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks +def main(positional_arguments): + # Command-line arguments like '--distortions False' are equivalent to + # '--distortions=True False', where False is a positional argument. To prevent + # this from silently running with distortions, we do not allow positional + # arguments. + assert len(positional_arguments) >= 1 + if len(positional_arguments) > 1: + raise ValueError('Received unknown positional arguments: %s' + % positional_arguments[1:]) + + params = benchmark_cnn.make_params_from_flags() + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + + tfversion = cnn_util.tensorflow_version_tuple() + log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) + + run_benchmark(bench, absl_flags.FLAGS.iters_per_step) + +if __name__ == '__main__': + tf.disable_v2_behavior() + app.run(main) # Raises error on invalid flags, unlike tf.app.run() diff --git a/cv/classification/resnet50/tensorflow/all_reduce_benchmark_test.py b/cv/classification/resnet50/tensorflow/all_reduce_benchmark_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c8efd53f421049e697a4eeea7486a758c5a52a6c --- /dev/null +++ b/cv/classification/resnet50/tensorflow/all_reduce_benchmark_test.py @@ -0,0 +1,52 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for all_reduce_benchmark.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +import all_reduce_benchmark +import benchmark_cnn +import test_util + + +class AllReduceBenchmarkTest(tf.test.TestCase): + """Tests the all-reduce benchmark.""" + + def _test_run_benchmark(self, params): + """Tests that run_benchmark() runs successfully with the params.""" + logs = [] + with test_util.monkey_patch(all_reduce_benchmark, + log_fn=test_util.print_and_add_to_list(logs)): + bench_cnn = benchmark_cnn.BenchmarkCNN(params) + all_reduce_benchmark.run_benchmark(bench_cnn, num_iters=5) + self.assertRegex(logs[-1], '^Average time per step: [0-9.]+$') + + def test_run_benchmark(self): + """Tests that run_benchmark() runs successfully.""" + params = benchmark_cnn.make_params(num_batches=10, + variable_update='replicated', + num_gpus=2) + self._test_run_benchmark(params) + params = params._replace(hierarchical_copy=True, gradient_repacking=8, + num_gpus=8) + self._test_run_benchmark(params) + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/allreduce.py b/cv/classification/resnet50/tensorflow/allreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..fa51f843444b543622ec01c3322a282ea0fc5139 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/allreduce.py @@ -0,0 +1,648 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for allreduce.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as pycoll +import re + +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +try: + from tensorflow.python.distribute.v1 import all_reduce +except: + from tensorflow.python.distribute import all_reduce +from tensorflow.python.framework import device as pydev +from tensorflow.python.framework import ops +from tensorflow.python.ops import collective_ops + +AllReduceSpecTuple = pycoll.namedtuple('AllReduceSpecTuple', 'alg shards limit') + + +def parse_general_int(s): + """Parse integer with power-of-2 suffix eg. 
32k.""" + mo = re.match(r'(\d+)([KkMGT]?)$', s) + if mo: + i, suffix = mo.group(1, 2) + v = int(i) + if suffix: + if suffix == 'K' or suffix == 'k': + v *= 1024 + elif suffix == 'M': + v *= (1024 * 1024) + elif suffix == 'G': + v *= (1024 * 1024 * 1024) + elif suffix == 'T': + v *= (1024 * 1024 * 1024 * 1024) + else: + raise ValueError('invalid integer string %s' % s) + return v + else: + v = int(s) + return v + + +def parse_all_reduce_spec(all_reduce_spec): + """Parse all_reduce_spec. + + Args: + all_reduce_spec: a string specifying a combination of all-reduce + algorithms to apply for gradient reduction. + + Returns: + a list of AllReduceSpecTuple. + + Raises: + ValueError: all_reduce_spec is not well-formed. + + An all_reduce_spec has BNF form: + int ::= positive whole number + g_int ::= int[KkMGT]? + alg_spec ::= alg | alg#int + range_spec ::= alg_spec | alg_spec/alg_spec + spec ::= range_spec | range_spec:g_int:range_spec + + Not all syntactically correct specifications are supported. + Examples of supported all_reduce_spec strings, with semantics explained: + + 'collective' == apply tf.collective_reduce operator to all tensors. + 'collective#2' == apply tf.collective_reduce operator to all tensors, + requesting up to 2 simultaneous transfers at each node, if + feasible, by subdividing tensor by an additional factor of 2. + 'xring' == apply ring all-reduce to all tensors + 'xring#2' == apply ring all-reduce to all tensors, using two simultaneous + transfer rings, each operating on 1/2 of each tensor. + 'nccl' == apply NCCL all-reduce to all tensors (only works within + a single worker process where all devices are GPUs) + 'nccl/xring' == apply NCCL all-reduce to all tensors within each worker + to produce at least one full-reduced (locally) value, + then apply ring all-reduce to one such value from each + worker, then apply NCCL broadcast to propagate those globally + reduced values back to every device within each worker. + 'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each + distributed tensor is reduced by copying all instances to + one of the worker CPUs, computing the reduction there, then + copying back to each participating device. Tensor reductions + are assigned to specific CPUs round-robin. + 'psgpu#4' == Arrange all GPUs across all workers into groups of 4. + Each distributed tensor is shuffle reduced against one + such group of 4 GPUs, selected round-robin. That is, each + tensor is split across 4 shards for the reduction. + 'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to + tensors of size <= 2048 elements, apply 2-shard pscpu to + tensors up to size 64k elements, apply xring to larger tensors. + 'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on + the worker's CPU, then use 2-shard shuffle to reduce those + locally reduced tensors across workers (on the worker CPUs), then + scatter the globally reduced values locally from each worker CPU. 
+ """ + range_parts = all_reduce_spec.split(':') + ['-1'] + if len(range_parts) % 2: + raise ValueError('all_reduce_spec not well formed: %s' % all_reduce_spec) + limit = 0 + spec = [] + alg = None + shards = 1 + for i, range_part in enumerate(range_parts): + if i % 2 == 1: + try: + limit = parse_general_int(range_part) + spec.append(AllReduceSpecTuple(alg=alg, shards=shards, limit=limit)) + except ValueError: + raise ValueError('all_reduce_spec (%s) contains non-integer range %s' % + (all_reduce_spec, range_part)) + else: + alg = range_part + alg_parts = range_part.split('#') + alg = alg_parts[0] + if len(alg_parts) > 1: + try: + shards = int(alg_parts[1]) + except ValueError: + raise ValueError('all_reduce_spec (%s) contains non-integer ' + 'shards %s' % all_reduce_spec, alg_parts[1]) + else: + shards = 1 + if alg not in [ + 'nccl', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu', 'xring', 'pscpu', + 'psgpu', 'pscpu/pscpu', 'collective' + ]: + raise ValueError('all_reduce_spec (%s) contains invalid alg %s' % + (all_reduce_spec, alg)) + return spec + + +def build_all_reduce_device_prefixes(job_name, num_tasks): + """Build list of device prefix names for all_reduce. + + Args: + job_name: 'worker', 'ps' or 'localhost'. + num_tasks: number of jobs across which device names should be generated. + + Returns: + A list of device name prefix strings. Each element spells out the full + host name without adding the device. + e.g. '/job:worker/task:0' + """ + if job_name != 'localhost': + return ['/job:%s/task:%d' % (job_name, d) for d in range(0, num_tasks)] + else: + assert num_tasks == 1 + return ['/job:%s' % job_name] + + +def group_device_names(devices, group_size): + """Group device names into groups of group_size. + + Args: + devices: list of strings naming devices. + group_size: int >= 1 + + Returns: + list of lists of devices, where each inner list is group_size long, + and each device appears at least once in an inner list. If + len(devices) % group_size = 0 then each device will appear + exactly once. + + Raises: + ValueError: group_size > len(devices) + """ + num_devices = len(devices) + if group_size > num_devices: + raise ValueError('only %d devices, but group_size=%d' % (num_devices, + group_size)) + num_groups = ( + num_devices // group_size + (1 if (num_devices % group_size != 0) else 0)) + groups = [[] for i in range(num_groups)] + for i in range(0, num_groups * group_size): + groups[i % num_groups].append(devices[i % num_devices]) + return groups + + +def split_grads_by_size(threshold_size, device_grads): + """Break gradients into two sets according to tensor size. + + Args: + threshold_size: int size cutoff for small vs large tensor. + device_grads: List of lists of (gradient, variable) tuples. The outer + list is over devices. The inner list is over individual gradients. + + Returns: + small_grads: Subset of device_grads where shape is <= theshold_size + elements. + large_grads: Subset of device_grads where shape is > threshold_size + elements. 
+ """ + small_grads = [] + large_grads = [] + for dl in device_grads: + small_dl = [] + large_dl = [] + for (g, v) in dl: + tensor_size = g.get_shape().num_elements() + if tensor_size <= threshold_size: + small_dl.append([g, v]) + else: + large_dl.append([g, v]) + if small_dl: + small_grads.append(small_dl) + if large_dl: + large_grads.append(large_dl) + return small_grads, large_grads + + +_instance_key = 1 + + +def new_collective_instance_key(): + """Returns a new instance key for use in defining a collective op.""" + global _instance_key + v = _instance_key + _instance_key += 1 + return v + + +_group_key = 1 +_group_key_table = dict() + + +def collective_group_key(devices): + """Returns a group key for the set of devices. + + Args: + devices: list of strings naming devices in a collective group. + + Returns: + int key uniquely identifying the set of device names. + """ + global _group_key + global _group_key_table + parsed = [pydev.DeviceSpec.from_string(d) for d in devices] + names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed]) + concat = ','.join(names) + if concat not in _group_key_table.keys(): + new_key = _group_key + _group_key += 1 + _group_key_table[concat] = new_key + rv = _group_key_table[concat] + return rv + + +def build_collective_reduce(input_tensors, num_workers, num_shards, + red_op='Add', un_op='Id'): + """Build a subgraph that does one full all-reduce, using the collective Op. + + Args: + input_tensors: tensors within a single worker graph that are to be reduced + together; must be one per device. + num_workers: total number of workers with identical independent graphs that + will be doing this same reduction. The reduction will actually include + the corresponding tensors at all these workers. + num_shards: number of shards into which to divide each per-tick chunk, + normally 1 but could be higher on multi-data-path architectures. + red_op: string naming the reduction op + un_op: string naming the unary final op + + Returns: + An array of final tensors, one per device, computed by the full reduction. + + Raises: + ValueError: There must be at least two tensors over all the workers. 
+ """ + group_size = len(input_tensors) * num_workers + if group_size < 2: + raise ValueError('num_workers * len(input_tensors) must be 2 or greater') + devices = [t.device for t in input_tensors] + num_devices = len(devices) + group_key = collective_group_key(devices) + instance_key = new_collective_instance_key() + out_tensors = [] + if num_shards == 1: + subdiv_offsets = [0] + elif num_shards == 2: + if num_devices > 1: + subdiv_offsets = [0, -(num_devices // 2)] + else: + subdiv_offsets = [0] + else: + raise ValueError('Unsupported num_shards %d' % num_shards) + for d in range(num_devices): + with ops.device(devices[d]): + reduce_op = collective_ops.all_reduce(input_tensors[d], + group_size, group_key, instance_key, + red_op, un_op, + subdiv_offsets) + out_tensors.append(reduce_op) + return out_tensors + + +def broadcast_send(t, shape, dtype, group_size, group_key, instance_key): + return collective_ops.broadcast_send(t, shape, dtype, group_size, group_key, + instance_key) + + +def broadcast_recv(shape, dtype, group_size, group_key, instance_key): + return collective_ops.broadcast_recv(shape, dtype, group_size, group_key, + instance_key) + + +def sum_grad_and_var_all_reduce(single_session, + grad_and_vars, + num_workers, + alg, + gpu_indices, + aux_devices=None, + num_shards=1): + """Apply all-reduce algorithm over specified gradient tensors.""" + scaled_grads = [g for g, _ in grad_and_vars] + if alg == 'collective': + assert not single_session + summed_grads = build_collective_reduce( + scaled_grads, num_workers, num_shards, 'Add', 'Id') + else: + with tf.name_scope('allreduce'): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + if alg == 'nccl': + summed_grads = all_reduce.build_nccl_all_reduce(scaled_grads, tf.add) + elif alg == 'xring': + summed_grads = all_reduce.build_ring_all_reduce( + scaled_grads, num_workers, num_shards, gpu_indices, tf.add) + elif alg == 'nccl/xring': + summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards, + tf.add) + elif alg == 'nccl/rechd': + summed_grads = all_reduce.build_nccl_then_recursive_hd( + scaled_grads, tf.add) + elif alg == 'nccl/pscpu': + summed_grads = all_reduce.build_nccl_then_shuffle( + scaled_grads, aux_devices, tf.add, tf.add_n) + elif alg == 'pscpu/pscpu': + summed_grads = all_reduce.build_shuffle_then_shuffle( + scaled_grads, + aux_devices, + # TODO(tucker): devise a way of better specifying the device set + # for the second level. + [aux_devices[0]], + tf.add_n) + elif alg in ['pscpu', 'psgpu']: + summed_grads = all_reduce.build_shuffle_all_reduce( + scaled_grads, aux_devices, tf.add_n) + else: + raise ValueError('unsupported all_reduce alg: ', alg) + + result = [] + for (_, v), g in zip(grad_and_vars, summed_grads): + result.append([g, v]) + return result + + +def contains_any(haystack, needles): + """Tests if any needle is a substring of haystack. + + Args: + haystack: a string + needles: list of strings + + Returns: + True if any element of needles is a substring of haystack, + False otherwise. + """ + for n in needles: + if n in haystack: + return True + return False + + +def sum_gradients_all_reduce(single_session, + dev_prefixes, + tower_grads, + num_workers, + alg, + num_shards, + gpu_indices, + agg_small_grads_max_bytes=0, + agg_small_grads_max_group=10, + allreduce_merge_scope=1): + """Apply all-reduce algorithm over specified gradient tensors. 
+ + Args: + single_session: true if reduction is applied to one graph across + all workers, false if ths application is to a single-worker graph only. + dev_prefixes: list of prefix strings to use to generate PS device names. + tower_grads: the gradients to reduce. + num_workers: number of worker processes across entire job. + alg: the all-reduce algorithm to apply. + num_shards: alg-specific sharding factor. + gpu_indices: indices of local GPUs in order usable for ring-reduce. + agg_small_grads_max_bytes: largest tensor eligible for aggregation, + in number of bytes. + agg_small_grads_max_group: largest permitted aggregation of small + tensors. + allreduce_merge_scope: size of groups into which to partition consecutive + gradients grouped under a common 'allreduce' name scope for application + of ScopedAllocator optimization. + + Returns: + list of reduced tensors + """ + alg_contains_shuffle = contains_any(alg, ['pscpu', 'psgpu']) + is_hierarchical = '/' in alg + if 'pscpu' in alg: + aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes] + elif 'psgpu' in alg: + aux_devices = [ + prefix + '/gpu:%d' % i + for i in range(len(gpu_indices)) + for prefix in dev_prefixes + ] + else: + aux_devices = ['/job:localhost/cpu:0'] + aux_device_groups = group_device_names( + aux_devices, + num_shards if (alg != 'collective' and alg_contains_shuffle) else 1) + group_index = 0 + if agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0: + tower_grads, packing = pack_small_tensors( + tower_grads, + max_bytes=agg_small_grads_max_bytes, + max_group=agg_small_grads_max_group) + else: + packing = None + reduced_gv_list = [] + gv = list(zip(*tower_grads)) + merge_scope = allreduce_merge_scope if allreduce_merge_scope > 0 else 1 + chunked_gv = [gv[x:x + merge_scope] + for x in xrange(0, len(gv), merge_scope)] + for chunk in chunked_gv: + with tf.name_scope('allreduce'): + for grad_and_vars in chunk: + reduced_gv_list.append(sum_grad_and_var_all_reduce( + single_session, + grad_and_vars, num_workers, alg, gpu_indices, + (aux_devices if is_hierarchical + else aux_device_groups[group_index]), + num_shards)) + group_index = (group_index + 1) % len(aux_device_groups) + new_tower_grads = [list(x) for x in zip(*reduced_gv_list)] + if packing: + new_tower_grads = unpack_small_tensors(new_tower_grads, packing) + return new_tower_grads + + +def extract_ranges(index_list, range_size_limit=32): + """Extract consecutive ranges and singles from index_list. + + Args: + index_list: List of monotone increasing non-negative integers. + range_size_limit: Largest size range to return. If a larger + consecutive range exists it will be returned as multiple + ranges. + + Returns: + ranges, singles where ranges is a list of [first, last] pairs of + consecutive elements in index_list, and singles is all of the + other elements, in original order. + """ + if not index_list: + return [], [] + first = index_list[0] + last = first + ranges = [] + singles = [] + for i in index_list[1:]: + if i == last + 1 and (last - first) <= range_size_limit: + last = i + else: + if last > first: + ranges.append([first, last]) + else: + singles.append(first) + first = i + last = i + if last > first: + ranges.append([first, last]) + else: + singles.append(first) + return ranges, singles + + +GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes') + + +def pack_range(key, packing, grad_vars, rng): + """Form the concatenation of a specified range of gradient tensors. 
+ + Args: + key: Value under which to store meta-data in packing that will be used + later to restore the grad_var list structure. + packing: Dict holding data describing packed ranges of small tensors. + grad_vars: List of (grad, var) pairs for one tower. + rng: A pair of integers giving the first, last indices of a consecutive + range of tensors to be packed. + + Returns: + A tensor that is the concatenation of all the specified small tensors. + """ + to_pack = grad_vars[rng[0]:rng[1] + 1] + members = [] + variables = [] + restore_shapes = [] + with tf.name_scope('pack'): + for g, v in to_pack: + variables.append(v) + restore_shapes.append(g.shape) + with tf.device(g.device): + members.append(tf.reshape(g, [-1])) + packing[key] = GradPackTuple( + indices=range(rng[0], rng[1] + 1), + vars=variables, + shapes=restore_shapes) + with tf.device(members[0].device): + return tf.concat(members, 0) + + +def unpack_grad_tuple(gv, gpt): + """Unpack a previously packed collection of gradient tensors. + + Args: + gv: A (grad, var) pair to be unpacked. + gpt: A GradPackTuple describing the packing operation that produced gv. + + Returns: + A list of (grad, var) pairs corresponding to the values that were + originally packed into gv, maybe following subsequent operations like + reduction. + """ + elt_widths = [x.num_elements() for x in gpt.shapes] + with tf.device(gv[0][0].device): + with tf.name_scope('unpack'): + splits = tf.split(gv[0], elt_widths) + unpacked_gv = [] + for idx, s in enumerate(splits): + unpacked_gv.append((tf.reshape(s, gpt.shapes[idx]), gpt.vars[idx])) + return unpacked_gv + + +def pack_small_tensors(tower_grads, max_bytes=0, max_group=0): + """Concatenate small gradient tensors together for reduction. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. + max_bytes: Int giving max number of bytes in a tensor that + may be considered small. + max_group: Int giving max number of small tensors that may be + concatenated into one new tensor. + + Returns: + new_tower_grads, packing where new_tower_grads is identical to + tower_grads except that all feasible small_tensors have been removed + from their places and concatenated into larger tensors that are + now in the front of the list for each tower, and packing contains + the data necessary to restore the tower_grads structure. + + Look through the first tower for gradients of the same type (float), + and small size, that are all sequential. For each such group, + replace by a new tensor that is a flattened concatenation. Note + that the corresponding variable will be absent, which doesn't matter + because it isn't used during all-reduce. + + Requires: + Every gv_list in towers must have isomorphic structure including identical + tensor sizes and types. 
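+
+  For example, with max_bytes=16 (four float32 elements) and per-tower
+  gradients of shapes [4], [4], [3, 3], [3, 3], the two [4] gradients are
+  concatenated into a single [8] tensor at the front of each tower's list,
+  while the [3, 3] gradients follow it in their original order.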
+ """ + small_indices = [] + large_indices = [] + for idx, (g, _) in enumerate(tower_grads[0]): + if g.dtype == tf.float32 and (4 * g.shape.num_elements()) <= max_bytes: + small_indices.append(idx) + else: + large_indices.append(idx) + small_ranges, small_singles = extract_ranges( + small_indices, range_size_limit=max_group) + large_indices = sorted(large_indices + small_singles) + num_gv = len(tower_grads[0]) + packing = {} + if small_ranges: + new_tower_grads = [] + for dev_idx, gv_list in enumerate(tower_grads): + assert len(gv_list) == num_gv + new_gv_list = [] + for r in small_ranges: + key = '%d:%d' % (dev_idx, len(new_gv_list)) + new_gv_list.append((pack_range(key, packing, gv_list, r), + 'packing_var_placeholder')) + for i in large_indices: + new_gv_list.append(gv_list[i]) + new_tower_grads.append(new_gv_list) + return new_tower_grads, packing + else: + return tower_grads, None + + +def unpack_small_tensors(tower_grads, packing): + """Undo the structure alterations to tower_grads done by pack_small_tensors. + + Args: + tower_grads: List of List of (grad, var) tuples. + packing: A dict generated by pack_small_tensors describing the changes + it made to tower_grads. + + Returns: + new_tower_grads: identical to tower_grads except that concatentations + of small tensors have been split apart and returned to their original + positions, paired with their original variables. + """ + if not packing: + return tower_grads + new_tower_grads = [] + num_devices = len(tower_grads) + num_packed = len(packing.keys()) // num_devices + for dev_idx, gv_list in enumerate(tower_grads): + new_gv_list = gv_list[num_packed:] + for i in xrange(0, num_packed): + k = '%d:%d' % (dev_idx, i) + gpt = packing[k] + gv = unpack_grad_tuple(gv_list[i], gpt) + for gi, idx in enumerate(gpt.indices): + assert idx == gpt.indices[gi] + new_gv_list.insert(idx, gv[gi]) + new_tower_grads.append(new_gv_list) + return new_tower_grads diff --git a/cv/classification/resnet50/tensorflow/allreduce_test.py b/cv/classification/resnet50/tensorflow/allreduce_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a372d7ebfbaa4d4d42921549be67d7d7683837a3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/allreduce_test.py @@ -0,0 +1,448 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for tf_cnn_benchmark.allreduce.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as pycoll + +import numpy as np +import tensorflow.compat.v1 as tf +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import variables +import allreduce + + +class AllReduceTest(tf.test.TestCase): + + def testGroupKey(self): + d0 = ['/job:worker/replica:0/task:0/device:GPU:1', + '/job:worker/replica:0/task:0/device:GPU:0', + '/job:worker/replica:0/task:0/device:GPU:3',] + d1 = ['/job:worker/replica:0/task:1/device:GPU:1', + '/job:worker/replica:0/task:1/device:GPU:0', + '/job:worker/replica:0/task:1/device:GPU:3',] + d2 = ['/job:worker/replica:0/task:1/device:GPU:1', + '/job:worker/replica:0/task:1/device:GPU:3', + '/job:worker/replica:0/task:1/device:GPU:0',] + d3 = ['/job:worker/replica:0/task:1/device:GPU:1', + '/job:worker/replica:0/task:1/device:GPU:3', + '/job:worker/replica:0/task:1/device:GPU:2',] + d4 = ['/job:worker/task:0/device:GPU:1', + '/job:worker/task:0/device:GPU:2', + '/job:worker/task:0/device:GPU:3',] + d5 = ['/job:worker/task:0/device:CPU:1', + '/job:worker/task:0/device:CPU:2'] + d6 = ['/job:worker/task:0/device:CPU:2', + '/job:worker/task:0/device:CPU:1'] + g0 = allreduce.collective_group_key(d0) + g1 = allreduce.collective_group_key(d1) + g2 = allreduce.collective_group_key(d2) + g3 = allreduce.collective_group_key(d3) + g4 = allreduce.collective_group_key(d4) + g5 = allreduce.collective_group_key(d5) + g6 = allreduce.collective_group_key(d6) + self.assertEqual(g0, g1) + self.assertEqual(g0, g2) + self.assertTrue(g0 != g3) + self.assertEqual(g3, g4) + self.assertEqual(g5, g6) + self.assertTrue(g4 != g5) + + def testExtractRanges(self): + x = [] + expected_ranges = [] + expected_singles = [] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 3, 4, 6, 7, 8, 9] + expected_ranges = [[3, 4], [6, 9]] + expected_singles = [1] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 2, 3, 4, 6, 7, 8, 9] + expected_ranges = [[1, 4], [6, 9]] + expected_singles = [] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 3, 4, 6, 7, 9] + expected_ranges = [[3, 4], [6, 7]] + expected_singles = [1, 9] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + x = [1, 3, 6, 9] + expected_ranges = [] + expected_singles = [1, 3, 6, 9] + ranges, singles = allreduce.extract_ranges(x) + self.assertEqual(expected_ranges, ranges) + self.assertEqual(expected_singles, singles) + + def testPackRange(self): + packing = {} + t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32) + t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32) + + gv = [(t0, 'v0'), (t1, 'v1')] + new_t = allreduce.pack_range('0:0', packing, gv, [0, 1]) + self.assertEqual(1, new_t.shape.ndims) + self.assertEqual(8, new_t.shape.dims[0]) + self.assertEqual( + packing, { + '0:0': + allreduce.GradPackTuple( + indices=range(2), + vars=['v0', 'v1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]) + }) + + t2 = tf.constant([[0, 1, 2], [3, 4, 5], 
[6, 7, 8]], dtype=tf.float32) + t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32) + gv = [(t0, 'v0'), (t1, 'v1'), (t2, 'v2'), (t3, 'v3')] + packing = {} + new_t = allreduce.pack_range('1:0', packing, gv, [0, 3]) + self.assertEqual(1, new_t.shape.ndims) + self.assertEqual(26, new_t.shape.dims[0]) + self.assertEqual( + packing, { + '1:0': + allreduce.GradPackTuple( + indices=range(4), + vars=['v0', 'v1', 'v2', 'v3'], + shapes=[ + tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3]), + tf.TensorShape([3, 3]) + ]) + }) + + def testUnpackGradTuple(self): + packing = { + '0:0': + allreduce.GradPackTuple( + indices=range(4), + vars=['v0', 'v1', 'v2', 'v3'], + shapes=[ + tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3]), + tf.TensorShape([3, 3]) + ]) + } + tc = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32) + packed_gv = [tc, 'packing_var_placeholder'] + gv = allreduce.unpack_grad_tuple(packed_gv, packing['0:0']) + self.assertEqual(4, len(gv)) + self.assertEqual('v0', gv[0][1]) + self.assertEqual('v1', gv[1][1]) + self.assertEqual('v2', gv[2][1]) + self.assertEqual('v3', gv[3][1]) + self.assertEqual(1, gv[0][0].shape.ndims) + self.assertEqual(4, gv[0][0].shape.dims[0]) + self.assertEqual(1, gv[1][0].shape.ndims) + self.assertEqual(4, gv[1][0].shape.dims[0]) + self.assertEqual(2, gv[2][0].shape.ndims) + self.assertEqual(3, gv[2][0].shape.dims[0]) + self.assertEqual(3, gv[2][0].shape.dims[1]) + + def testPackSmallTensors(self): + t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32) + t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32) + t2 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32) + t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32) + tower_grads = [] + for d in range(0, 3): + gv = [(t0, 'v_%d_0' % d), (t1, 'v_%d_1' %d), (t2, 'v_%d_2' %d), + (t3, 'v_%d_3' % d)] + tower_grads.append(gv) + + # 1) Set the size limit so small that nothing gets concatenated. 
+ new_tower_grads, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=12, + max_group=10) + self.assertEqual(tower_grads, new_tower_grads) + self.assertTrue(packing is None) + + # 2) Set the size limit so only the first two tensors get concatenated + new_tower_grads, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=16, # 16 bytes == 4 elements + max_group=10) + self.assertEqual(3, len(new_tower_grads)) + self.assertEqual(4, len(tower_grads[0])) + first_tower = new_tower_grads[0] + self.assertEqual(3, len(first_tower)) + self.assertEqual(1, first_tower[0][0].shape.ndims) + self.assertEqual(8, first_tower[0][0].shape.dims[0]) + self.assertEqual(packing, + {'0:0': allreduce.GradPackTuple( + indices=range(2), + vars=['v_0_0', 'v_0_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '1:0': allreduce.GradPackTuple( + indices=range(2), + vars=['v_1_0', 'v_1_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '2:0': allreduce.GradPackTuple( + indices=range(2), + vars=['v_2_0', 'v_2_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])])}) + + # 3) Set the size limit so all tensors get concatenated + new_tower_grads, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=256, # bytes = 64 elements + max_group=10) + self.assertEqual(3, len(new_tower_grads)) + self.assertEqual(4, len(tower_grads[0])) + self.assertEqual(1, len(new_tower_grads[0])) + first_tower = new_tower_grads[0] + self.assertEqual(1, first_tower[0][0].shape.ndims) + self.assertEqual(26, first_tower[0][0].shape.dims[0]) + self.assertEqual(packing, + {'0:0': allreduce.GradPackTuple( + indices=range(4), + vars=['v_0_0', 'v_0_1', 'v_0_2', 'v_0_3'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])]), + '1:0': allreduce.GradPackTuple( + indices=range(4), + vars=['v_1_0', 'v_1_1', 'v_1_2', 'v_1_3'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])]), + '2:0': allreduce.GradPackTuple( + indices=range(4), + vars=['v_2_0', 'v_2_1', 'v_2_2', 'v_2_3'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4]), + tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])])}) + + def testUnpackSmallTensors(self): + packing = {'0:0': allreduce.GradPackTuple(indices=range(2), + vars=['v_0_0', 'v_0_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '0:1': allreduce.GradPackTuple(indices=range(3, 5), + vars=['v_0_3', 'v_0_4'], + shapes=[tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])]), + '1:0': allreduce.GradPackTuple(indices=range(2), + vars=['v_1_0', 'v_1_1'], + shapes=[tf.TensorShape([4]), + tf.TensorShape([4])]), + '1:1': allreduce.GradPackTuple(indices=range(3, 5), + vars=['v_1_3', 'v_1_4'], + shapes=[tf.TensorShape([3, 3,]), + tf.TensorShape([3, 3,])])} + t0 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7], dtype=tf.float32) + t1 = tf.constant([17, 17], dtype=tf.float32) + t2 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, 8, + 0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32) + t3 = tf.constant([0], dtype=tf.float32) + tower_grads = [] + for d in range(0, 2): + one_tower = [(t0, 'packing_var_placeholder'), + (t2, 'packing_var_placeholder'), + (t1, 'v_%d_2' % d), (t3, 'v_%d_5' %d)] + tower_grads.append(one_tower) + new_tower_grads = allreduce.unpack_small_tensors(tower_grads, packing) + self.assertEqual(2, len(new_tower_grads)) + for d, tg in enumerate(new_tower_grads): + self.assertEqual(6, len(tg)) + self.assertEqual('v_%d_0' % d, tg[0][1]) + self.assertEqual('v_%d_1' % d, tg[1][1]) 
+ self.assertEqual('v_%d_2' % d, tg[2][1]) + self.assertEqual('v_%d_3' % d, tg[3][1]) + self.assertEqual('v_%d_4' % d, tg[4][1]) + self.assertEqual('v_%d_5' % d, tg[5][1]) + self.assertEqual(1, tg[0][0].shape.ndims) + self.assertEqual(4, tg[0][0].shape.dims[0]) + self.assertEqual(1, tg[1][0].shape.ndims) + self.assertEqual(4, tg[1][0].shape.dims[0]) + self.assertEqual(1, tg[2][0].shape.ndims) + self.assertEqual(2, tg[2][0].shape.dims[0]) + self.assertEqual(2, tg[3][0].shape.ndims) + self.assertEqual(3, tg[3][0].shape.dims[0]) + self.assertEqual(3, tg[3][0].shape.dims[1]) + self.assertEqual(2, tg[4][0].shape.ndims) + self.assertEqual(3, tg[4][0].shape.dims[0]) + self.assertEqual(3, tg[4][0].shape.dims[1]) + self.assertEqual(1, tg[5][0].shape.ndims) + self.assertEqual(1, tg[5][0].shape.dims[0]) + + +class DynamicPackingTest(test_util.TensorFlowTestCase): + """Packing/Unpacking tests that require executing a TensorFlow session.""" + + def _init_tensors(self, num_towers, tensor_shapes): + """Construct a collection of tensors across multiple devices.""" + num_tensors = len(tensor_shapes) + consts = [] + tensors = [] + vrbls = [] + tower_grads = [] + tf.Variable([-1], dtype=tf.int32, name='packing_var_placeholder') + for dev_idx in range(0, num_towers): + devname = '/job:localhost/device:GPU:%d' % dev_idx + consts.append([]) + tensors.append([]) + vrbls.append([]) + with tf.device(devname): + base_value = 0 + gv_tuples = [] + for t_idx in range(0, num_tensors): + shape = tensor_shapes[t_idx] + num_elts = 0 + for d in shape: + num_elts = (num_elts or 1) * d + c = np.fromiter(range(base_value, base_value + num_elts), + dtype=np.float32).reshape(shape) + base_value += num_elts + consts[dev_idx].append(c) + tensors[dev_idx].append(tf.constant(c)) + vrbls[dev_idx].append( + tf.Variable(c, name='v_d%d_t%d' % (dev_idx, t_idx))) + gv_tuples.append((tensors[dev_idx][-1], vrbls[dev_idx][-1])) + tower_grads.append(gv_tuples) + return tower_grads, consts, tensors, vrbls + + _test_tuple = pycoll.namedtuple('_test_tuple', + 'num_devices, in_shapes out_shapes out_i') + + def _do_pack_unpack_test(self, tt): + """Do a single pack-unpack test. + + Args: + tt: A _test_tuple defining the parameters of the test to do. + + This test executes a graph that performs a pack of tower_grads + followed by an unpack and verifies that the shapes and values + of gradient tensors are unchanged, along with paired variables. 
+ """ + with ops.Graph().as_default(): + tower_grads, consts, _, vrbls = self._init_tensors( + tt.num_devices, tt.in_shapes) + packed_tg, packing = allreduce.pack_small_tensors( + tower_grads, max_bytes=40, max_group=10) + unpacked_tg = allreduce.unpack_small_tensors(packed_tg, packing) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + packed = sess.run(packed_tg) + for d in range(0, tt.num_devices): + for t in range(0, len(tt.out_shapes)): + num_elts = 0 + for dim in tt.out_shapes[t]: + num_elts = (num_elts or 1) * dim + self.assertTrue(np.array_equal( + np.array(range(tt.out_i[t], tt.out_i[t] + num_elts), + dtype=np.float32).reshape(tt.out_shapes[t]), + packed[d][t][0])) + unpacked = sess.run(unpacked_tg) + for d in range(0, tt.num_devices): + for t in range(0, len(tt.in_shapes)): + self.assertTrue(np.array_equal(consts[d][t], unpacked[d][t][0])) + self.assertEqual(vrbls[d][t], unpacked_tg[d][t][1]) + + def testPackUnpack0(self): + self._do_pack_unpack_test( + self._test_tuple(num_devices=3, + in_shapes=[[8], [3, 3], [12], [5, 5, 5]], + out_shapes=[[17], [12], [5, 5, 5]], + out_i=[0, 17, 29])) + + def testPackUnpack1(self): + self._do_pack_unpack_test( + self._test_tuple(num_devices=4, + in_shapes=[[5, 5, 5], [2, 3], [5]], + out_shapes=[[11], [5, 5, 5]], + out_i=[125, 0])) + + def testPackUnpack2(self): + self._do_pack_unpack_test( + self._test_tuple(num_devices=2, + in_shapes=[[5, 5, 5], [2, 3], [1, 5], [7], [100]], + out_shapes=[[18], [5, 5, 5], [100]], + out_i=[125, 0, 143])) + + def _do_all_reduce_pack_test(self, tt): + """Test that all-reduce results are the same with or without packing.""" + with ops.Graph().as_default(): + tower_grads, consts, _, _ = self._init_tensors( + tt.num_devices, tt.in_shapes) + dev_prefixes = ['/job:localhost'] + num_workers = 1 + alg = 'xring' + shards = 1 + single_session = True + gpu_indices = range(0, tt.num_devices) + assert len(gpu_indices) == len(tower_grads) + no_pack_all_reduce = allreduce.sum_gradients_all_reduce( + single_session, + dev_prefixes, tower_grads, num_workers, alg, shards, + gpu_indices, + agg_small_grads_max_bytes=0, agg_small_grads_max_group=1) + packed_tg, packing = allreduce.pack_small_tensors(tower_grads, 100, 100) + packed_all_reduce = allreduce.sum_gradients_all_reduce( + single_session, + dev_prefixes, packed_tg, num_workers, alg, shards, + gpu_indices, + agg_small_grads_max_bytes=0, agg_small_grads_max_group=1) + unpacked_tg = allreduce.unpack_small_tensors(packed_all_reduce, packing) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + no_pack_values = sess.run(no_pack_all_reduce) + pack_unpack_values = sess.run(unpacked_tg) + for d in range(1, tt.num_devices): + for t in range(0, len(tt.in_shapes)): + self.assertTrue(np.allclose(no_pack_values[d][t][0], + tt.num_devices * consts[0][t])) + self.assertTrue(np.array_equal(no_pack_values[d][t][0], + pack_unpack_values[d][t][0])) + + def testAllReducePacked0(self): + self._do_all_reduce_pack_test( + self._test_tuple(num_devices=3, + in_shapes=[[8], [3, 3], [12], [5, 5, 5]], + out_shapes=[[17], [12], [5, 5, 5]], + out_i=[0, 17, 29])) + + def testAllReducePacked1(self): + self._do_all_reduce_pack_test( + self._test_tuple(num_devices=2, + in_shapes=[[8], [3, 3], [12], [5, 5, 5], [3], [4]], + out_shapes=[[17], [7], [12], [5, 5, 5]], + out_i=[0, 17, 29, 154, 157])) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git 
a/cv/classification/resnet50/tensorflow/batch_allreduce.py b/cv/classification/resnet50/tensorflow/batch_allreduce.py new file mode 100644 index 0000000000000000000000000000000000000000..e36a39ed45b143302724cd7d5b6a9f2d5c952dad --- /dev/null +++ b/cv/classification/resnet50/tensorflow/batch_allreduce.py @@ -0,0 +1,628 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains classes and functions for doing a single-machine batch all-reduce. + +An all-reduce is taking the reduction (typically a sum) of a list of tensors, +each on a different device. The result must end up back on each device, which is +where the word "all" comes from. In summary, each device starts with a single +tensor, and ends up with the reduction of all tensors. + +A batch all-reduce is doing several independent all-reduces. When doing a batch +all-reduce, care is taken to evenly distribute the reduction computations +across devices and inter-device tensor transfers across device links. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# TODO(reedwm): Support distributed all-reduces in this file. +# TODO(reedwm): Merge this code with allreduce.py, which contains some batch +# all-reduce code that this file calls. allreduce.py also supports distributed +# batch-reduce while this file only supports single-machine all-reduce. + +import abc + +import six +import tensorflow.compat.v1 as tf + +from tensorflow.python.ops import data_flow_ops +import allreduce +import constants + + +def _all_reduce_using_copy(tensors_across_devices, use_mean): + """Does an all-reduce of a list of tensors by copying to the current device. + + The tensors are copied to the current device and then reduced. + + Args: + tensors_across_devices: A list of tensors, each on a different device. + use_mean: Whether to take the mean of the tensors instead of a sum: + Returns: + A reduced tensor on the current device. + """ + reduced_tensor = tf.add_n(tensors_across_devices) + if use_mean: + reduced_tensor *= 1 / len(tensors_across_devices) + return reduced_tensor + + +@six.add_metaclass(abc.ABCMeta) +class BatchAllReduceAlgorithm(object): + """Represents an algorithm for performing a batch all-reduce operation.""" + + def batch_all_reduce(self, + all_device_tensors, + num_splits, + compact_tensors, + defer_tensors, + xla_compile=False): + """Performs a batch all-reduce. + + The reduction done is a sum. + + `all_device_tensors` is a list of list of tensors that will be batch + all-reduced. All tensors within a single inner list must be on the same + device. The nth element in each list, for any n, will be reduced together. + The return value is in the same form as `all_device_tensors`, except that + each tensor is reduced. 
+ + For example, if `all_device_tensors` is: + [[ A, B ], # A and B are on GPU 0 + [ C, D ]] # C and D are on GPU 1 + + Then the return value will be: + [[ A+C, B+D ], # These two tensors are on GPU 0 + [ A+C, B+D ]] # These two tensors are on GPU 1 + + Arguments: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` + is a tensor where `i` is the device index and `j` is the tensor index. + num_splits: If not None, tensors will be concatenated and split into this + many pieces during the all-reduce, then split back into their original + shapes afterwards. Has no impact on correctness and can improve + performance. Requires all tensors to be the same type. + compact_tensors: If True, tensors are casted to fp16 before being all- + reduced. Improves performance, but hurts numerical stability. + defer_tensors: If True, every time the return value + `reduced_all_device_tensors` is evaluated, the result will be the + reduced tensors values of `all_device_tensors` from the previous session + run instead of the current session run, or zero on the first session + run. This can improve performance. When training neural networks, + deferring gradients often does not harm training, so this can be used to + improve performance. + xla_compile: If True, use XLA to compile gradients packing and unpacking + ops. + + Returns: + reduced_all_device_tensors: A list in the same form as + `all_device_tensors`, except each tensor has been reduced. + warmup_ops: A list of ops needed to be run once before the all-reduce can + occur. + """ + + # Before all-reducing tensors, we do several preprocessing functions that + # can speed up the all-reduce. We undo these functions after all-reducing + # the tensors. + + # all_device_packed_tensors is a 2-d list of tensors indexed by + # [device_id][tensor_id], holding packed tensors from all devices involved + # in all-reduce. + all_device_packed_tensors = [] + + # all_device_warmup_ops is a 2-d list of ops indexed by + # [device_id][tensor_id], holding warmup_ops that need to be run once before + # all-reduce can occur. + all_device_warmup_ops = [] + + # all_device_put_ops is a 2-d list of ops indexed by + # [device_id][tensor_id], holding put ops for deferred tensors. They will be + # called in each all-reduce step automatically due to control dependency. + all_device_put_ops = [] + + # packers is a list of _TensorPacker, one for each device involved in + # all-reduce. + packers = [ + _TensorPacker(num_splits, compact_tensors) for _ in all_device_tensors + ] + + for packer, device_tensors in zip(packers, all_device_tensors): + + def pack_single_device_tensors(packer=packer, + device_tensors=device_tensors): + """Pack gradient tensors of a device.""" + packed_tensors = packer.maybe_concat_tensors(device_tensors) + packed_tensors = packer.maybe_compact_tensors(packed_tensors) + # When xla_compile=False, defer tensors after concat for better + # performance. + if defer_tensors and not xla_compile: + packed_tensors, put_ops, warmup_ops = defer_single_device_tensors( + packed_tensors) + all_device_put_ops.append(put_ops) + all_device_warmup_ops.append(warmup_ops) + packed_tensors = packer.maybe_split_tensors(packed_tensors) + return packed_tensors + + with tf.device(device_tensors[0].device): + if xla_compile: + packed_tensors = tf.xla.experimental.compile( + pack_single_device_tensors) + # When xla_compile=True, intermediate tensors in packing process are + # not materialized. 
Thus, we defer tensors after packing process is + # completed instead of in the middle of it. + if defer_tensors: + packed_tensors, put_ops, warmup_ops = defer_single_device_tensors( + packed_tensors) + all_device_put_ops.append(put_ops) + all_device_warmup_ops.append(warmup_ops) + else: + packed_tensors = pack_single_device_tensors() + + all_device_packed_tensors.append(packed_tensors) + + # Perform all-reduce on packed tensors. + all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors) + + all_device_unpacked_tensors = [] + for packer, device_tensors in zip(packers, all_device_tensors): + + def unpack_single_device_tensors(packer=packer, + device_tensors=device_tensors): + """Unpack gradient tensors of a device.""" + unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors) + unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors) + unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors) + return unpacked_tensors + + with tf.device(device_tensors[0].device): + if xla_compile: + unpacked_device_tensor = tf.xla.experimental.compile( + unpack_single_device_tensors) + else: + unpacked_device_tensor = unpack_single_device_tensors() + + all_device_unpacked_tensors.append(unpacked_device_tensor) + + # Note: There is no undo operation for deferring tensors. But we do need to + # call _add_put_op_control_deps at the end if we deferred the tensors. + if defer_tensors: + all_device_unpacked_tensors = _add_put_op_control_deps( + all_device_unpacked_tensors, num_splits, all_device_put_ops) + + return all_device_unpacked_tensors, all_device_warmup_ops + + @abc.abstractmethod + def _do_batch_all_reduce(self, all_device_tensors): + """Performs a batch all-reduce. + + Unlike `self.batch_all_reduce`, this does not do any preprocessing of the + tensors. + + Args: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` + is a tensor where `i` is the device index and `j` is the tensor index. + Returns: + reduced_all_device_tensors: A list in the same form as + `all_device_tensors`, except each tensor has been reduced. + """ + pass + + +class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm): + """An algorithm that copies tensors to be reduced to a specific device.""" + + def __init__(self, devices_to_reduce_on, use_mean=False): + self._devices = devices_to_reduce_on + self._use_mean = use_mean + + def _do_batch_all_reduce(self, all_device_tensors): + reduced_tensors = [] + for i, tensors_across_devices in enumerate(zip(*all_device_tensors)): + with tf.device(self._devices[i % len(self._devices)]): + reduced_tensor = _all_reduce_using_copy(tensors_across_devices, + self._use_mean) + reduced_tensors.append(reduced_tensor) + # The tensors will be brought back to each device once they are used. + return [reduced_tensors] * len(all_device_tensors) + + +class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm): + """An algorithm that uses hierarchical copies. This is only optimized for + eight devices connected in NetworkTopology.DGX1 or NetworkTopology.GCP_V100 + topology. + """ + + def __init__(self, network_topology): + """Initializer for HierarchicalCopyAlgorithm. + + Args: + network_topology: An instance of Enum class constants.NetworkTopology. 
+ """ + self._network_topology = network_topology + + def _do_batch_all_reduce(self, all_device_tensors): + avail_devices = [device_tensors[0].device + for device_tensors in all_device_tensors] + reduced_tensors = [] + num_devices = len(avail_devices) + group_size = num_devices // 2 + for i, tensors_across_devices in enumerate(zip(*all_device_tensors)): + group_0_main_device, group_1_main_device = self.__get_main_devices( + i, num_devices) + if group_0_main_device < group_size: + group_0_begin = 0 + group_1_begin = group_size + else: + group_0_begin = group_size + group_1_begin = 0 + + # Reduce the first group. + group_0_tensors = tensors_across_devices[group_0_begin: + group_0_begin + group_size] + with tf.device(avail_devices[group_0_main_device]): + group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False) + + # Reduce the second group. + group_1_tensors = tensors_across_devices[group_1_begin: + group_1_begin + group_size] + with tf.device(avail_devices[group_1_main_device]): + group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False) + + # Reduce between the groups. + with tf.device(avail_devices[group_0_main_device]): + total_reduced_tensor = _all_reduce_using_copy( + [group_0_reduced_tensor, group_1_reduced_tensor], False) + + # Broadcast the result back into the root of each group. + with tf.device(avail_devices[group_0_main_device]): + group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor) + with tf.device(avail_devices[group_1_main_device]): + group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor) + + reduced_tensors_bcast = [] + for j in range(len(tensors_across_devices)): + with tf.device(avail_devices[j]): + # Broadcast the result back to each member in the group from the root. + if (group_0_main_device < group_size) == (j < group_size): + src_device_tensor = group_0_reduced_tensor_bcast + else: + src_device_tensor = group_1_reduced_tensor_bcast + reduced_tensors_bcast.append(tf.identity(src_device_tensor)) + + reduced_tensors.append(reduced_tensors_bcast) + + reduced_tensors = list(zip(*reduced_tensors)) + return reduced_tensors + + def __get_main_devices(self, tensor_index, num_devices): + """Returns the pair of main devices to use for initial reduction. + + Args: + tensor_index: Index of the current tensor in the list of tensors to copy. + num_devices: Total number of devices. + + Returns: + A tuple containing pair of main device indices for the initial + reduction. Then, the first element of the tuple should be used for the + final reduction. + + Raises: + ValueError: Invalid input arguments. + """ + if self._network_topology == constants.NetworkTopology.DGX1: + return tensor_index % num_devices, (tensor_index + + (num_devices // 2)) % num_devices + elif self._network_topology == constants.NetworkTopology.GCP_V100: + if num_devices != 8: + raise ValueError('HierarchicalCopy only supports eight devices in %s.' % + self._network_topology) + # TODO(hinsu): Generalize main device indices to handle any other + # isomorphic connection graph that connects two cliques using connections + # other than 0-5 and 2-7. + main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)] + return main_device_pairs[tensor_index % len(main_device_pairs)] + else: + # TODO(reedwm): make this logic more general for arbitrary topology. + raise ValueError( + 'HierarchicalCopy is not supported for %s network topology.' 
% + self._network_topology) + + +class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm): + """An algorithm that uses an all reduce spec.""" + + def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes, + agg_small_grads_max_group): + spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + if len(spec) != 1: + raise ValueError( + 'Replicated mode does not support hybrid all-reduce strategies') + self._all_reduce_spec = spec[0] + self._gpu_indices = gpu_indices + self._agg_small_grads_max_bytes = agg_small_grads_max_bytes + self._agg_small_grads_max_group = agg_small_grads_max_group + + def _do_batch_all_reduce(self, all_device_tensors): + # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other + # gradient aggregation code, since gradient aggregation is doing an all + # reduce. Currently, we do gradient repacking in two different places. + # TODO(reedwm): Change the allreduce code to reduce tensors instead of + # tower_grads. + tower_grads = [[(t, None) for t in device_tensors] + for device_tensors in all_device_tensors] + aggregated_device_grads = allreduce.sum_gradients_all_reduce( + False, # single_session + ['/job:localhost'], + tower_grads, + 1, + self._all_reduce_spec.alg, + self._all_reduce_spec.shards, + self._gpu_indices, + agg_small_grads_max_bytes=self._agg_small_grads_max_bytes, + agg_small_grads_max_group=self._agg_small_grads_max_group) + return [[t for t, _ in grad_vars] for grad_vars in aggregated_device_grads] + + +def algorithm_from_params(params): + """Returns a BatchAllReduceAlgorithm from a Params tuple.""" + if params.all_reduce_spec: + if params.gpu_indices: + gpu_indices = [int(x) for x in params.gpu_indices.split(',')] + else: + gpu_indices = [x for x in range(params.num_gpus)] + return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices, + params.agg_small_grads_max_bytes, + params.agg_small_grads_max_group) + elif params.hierarchical_copy: + return HierarchicalCopyAlgorithm(params.network_topology) + else: + if params.local_parameter_device == 'gpu': + devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)] + else: + devices_to_reduce_on = ['/cpu:0'] + return CopyToDeviceAlgorithm(devices_to_reduce_on) + + +def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True): + """Applies a function to each tensor in `all_device_tensors`. + + A new list of lists of tensors is returned, where every tensor in + `all_device_tensors` has had `apply_func` called on it. `all_device_tensors` + is not modified. + + Args: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is + a tensor where `i` is the device index and `j` is the tensor index. + apply_func: A function taking in three arguments: tensor, device_index, + tensor_index, and returning a modified tensor. + `tensor` is `all_device_tensors[device_index][tensor_index]`. + colocate: If True, apply_func will be run under context manager colocated + with it's input tensor. + Returns: + A list in the same form as `all_device_tensors`, except each tensor has had + `apply_func` called on it. 
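+
+  Example (illustrative only; assumes `all_device_tensors` has already been
+  built as described above):
+
+    # Add 1.0 to every tensor on every device, ignoring both indices.
+    bumped = _apply_to_all_device_tensors(
+        all_device_tensors,
+        lambda tensor, device_index, tensor_index: tensor + 1.0)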
+ """ + new_all_device_tensors = [] + for device_index, device_tensors in enumerate(all_device_tensors): + new_device_tensors = [] + for tensor_index, t in enumerate(device_tensors): + if colocate: + with tf.colocate_with(t): + new_t = apply_func(t, device_index, tensor_index) + else: + new_t = apply_func(t, device_index, tensor_index) + new_device_tensors.append(new_t) + new_all_device_tensors.append(new_device_tensors) + return new_all_device_tensors + + +def _defer_tensor(tensor): + """Defers the retrieval of a tensor. + + The tensor is put into a StagingArea, and the return value is the + retrieval of the tensor from the StagingArea. The effect is that the + tensor returned from this function is the tensor that was put in the + StagingArea for the previous Session.run() call. + + Args: + tensor: The tensor to defer for one step. + + Returns: + deferred_tensor: The tensor deferred for one step. + put_op: An op to put `tensor` in the StagingArea. Must be run every step + that `deferred_tensor` is run. + warmup_op: A warmup op that should be called before the first step. Puts + a zero tensor into the StagingArea. + """ + tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape]) + put_op = tensor_stage.put([tensor]) + warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)]) + + # Fetch the next tensor to use. + (tensor,) = tensor_stage.get() + return tensor, put_op, warmup_op + + +def defer_single_device_tensors(device_tensors): + """Defer tensors (gradients in this case) from a single device. + + Arguments: + device_tensors: A list of gradients tensors from a single device to defer. + + Returns: + deferred_tensors: A list of tensors deferred for one step. + put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run + every step that `deferred_tensors` is run. + warmup_ops: Warmup ops that should be called before the first step. Puts + zero tensors into the StagingArea. + """ + put_ops = [] + warmup_ops = [] + deferred_tensors = [] + + for tensor in device_tensors: + deferred_tensor, put_op, warmup_op = _defer_tensor(tensor) + deferred_tensors.append(deferred_tensor) + put_ops.append(put_op) + warmup_ops.append(warmup_op) + + return deferred_tensors, put_ops, warmup_ops + + +def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops): + """Add control dependencies from `put_ops` to `all_device_tensors`. + + This should only be called when deferred tensors are being used. + + The control dependencies are added so that the put ops are run whenever + `all_device_tensors` is run. That way, the caller does not have to explicitly + run the put ops. + + Args: + all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is + a tensor where `i` is the device index and `j` is the tensor index. + num_splits: The number of splits that were used for the all-reduce. + put_ops: A list of put ops from deferring the tensors. + Returns: + A list in the same form as `all_device_tensors`, except each tensor has a + control dependency on an op in `put_ops`. + + """ + def apply_func(tensor, device_index, tensor_index): + if num_splits == 0: + deps = [put_ops[device_index][tensor_index]] + else: + deps = put_ops[device_index] + assert len(deps) == 1 + with tf.control_dependencies(deps): + return tf.identity(tensor, name='control_dependency') + return _apply_to_all_device_tensors(all_device_tensors, apply_func) + + +class _TensorPacker(object): + """Packs and unpacks tensors into groups. 
+ + This class first concatenates a set of tensors, then split the concatenated + tensor into a small number of chunks. This is useful for all-reducing tensors, + as doing a small number of all-reduces on large tensors can be faster than + doing a large number of all-reduces on small tensors. + + It also provides option to compact tensors by casting them to fp16, for better + all-reduce performance. + + This class maintains states of processed tensors like shapes and types. So + each packer can only be used to pack and unpack one list of tensors. If you + need to pack multiple lists of tensors (say from multiple devices), then you + need multiple _TensorPacker object, one for each device. + """ + + def __init__(self, num_splits, compact): + """Initializes the _TensorPacker. + + Arguments: + num_splits: The number of tensors to split the concatenated tensor into. + The batch all-reduce will consist of `num_splits` all-reduces. if None + or zero, tensors are not split or concatenated. + compact: If True, tensors are casted to fp16 during packing and casted + back to their original dtypes during unpacking. + """ + self._num_splits = num_splits + self._compact = compact + self._before_compact_dtypes = [] + + def maybe_concat_tensors(self, device_tensors): + """Concatenate tensors into a single tensor.""" + if not self._num_splits: + return device_tensors + + flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors] + self._orig_shapes = [t.shape for t in device_tensors] + self._orig_sizes = [s.num_elements() for s in self._orig_shapes] + # All shapes must be fully defined. + assert None not in self._orig_sizes + concatenated_grad = tf.concat(flat_tensors, 0) + return [concatenated_grad] + + def maybe_split_tensors(self, concatenated_tensor): + """Split concatenated tensor into `num_splits` pieces.""" + if not self._num_splits: + return concatenated_tensor + + if len(concatenated_tensor) != 1: + raise RuntimeError('tensors must be concatenated via ' + 'maybe_concat_tensors() before splitting') + + concatenated_tensor = concatenated_tensor[0] + total_tensor_size = concatenated_tensor.shape.num_elements() + split_size = total_tensor_size // self._num_splits + split_size_last = total_tensor_size - split_size * (self._num_splits - 1) + split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last] + tensor_packs = tf.split(concatenated_tensor, split_sizes) + return tensor_packs + + def undo_maybe_split_tensors(self, tensor_packs): + """Undo maybe_split_tensors().""" + if not self._num_splits: + return tensor_packs + + return [tf.concat(tensor_packs, 0)] + + def undo_maybe_concat_tensors(self, concatenated_tensor): + """Undo maybe_concat_tensors().""" + if not self._num_splits: + return concatenated_tensor + + if len(concatenated_tensor) != 1: + raise RuntimeError( + 'undo_maybe_split_tensors() must be called before ' + 'undo_maybe_concat_tensors when num_splits is greater than 1') + concatenated_tensor = concatenated_tensor[0] + + tensors_with_sizes = tf.split(concatenated_tensor, + self._orig_sizes) + tensors_with_shapes = [ + tf.reshape(grad, shape) for grad, shape in zip( + tensors_with_sizes, self._orig_shapes) + ] + return tensors_with_shapes + + def maybe_compact_tensors(self, device_tensors): + """Cast tensors to fp16 and store their original types.""" + if not self._compact: + return device_tensors + + if self._before_compact_dtypes: + raise RuntimeError('maybe_compact_tensors can only be called once.') + + self._before_compact_dtypes = [t.dtype for t in device_tensors] + 
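# Cast to float16 so the all-reduce moves fewer bytes; undo_maybe_compact_tensors() uses the dtypes recorded above to cast back. +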
compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors] + + return compact_tensors + + def undo_maybe_compact_tensors(self, compact_tensors): + """Undo maybe_compact_tensors().""" + if not self._compact: + return compact_tensors + + if not self._before_compact_dtypes: + raise RuntimeError('maybe_compact_tensors() must be called before ' + 'undo_maybe_compact_tensors()') + + device_tensors = [ + tf.cast(t, dtype) + for t, dtype in zip(compact_tensors, self._before_compact_dtypes) + ] + return device_tensors diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn.py b/cv/classification/resnet50/tensorflow/benchmark_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..6f65ea69b46f479a649c81aaddc797f30809c1ae --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn.py @@ -0,0 +1,3554 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""TensorFlow benchmark library. + +See the README for more information. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +from collections import namedtuple +import contextlib +import math +import multiprocessing +import os +import re +import threading +import time +import traceback + +from absl import flags as absl_flags +import numpy as np + +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import cnn_util +import constants +import datasets +import flags +import mlperf +import variable_mgr +import variable_mgr_util +from cnn_util import log_fn +from models import model_config +from platforms import util as platforms_util +from google.protobuf import text_format +from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python import debug as tf_debug +from tensorflow.python.client import timeline +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import graph_util_impl +from tensorflow.python.framework import importer +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.platform import gfile +from tensorflow.python.util import nest + + +_DEFAULT_NUM_BATCHES = 100 + + +# GraphInfo encapsulates the tensors/ops that we care about after building a +# graph. We use them to benchmark the graph. +GraphInfo = namedtuple( # pylint: disable=invalid-name + 'GraphInfo', + [ + # Ops that produce the input batches (before preprocessing). 
+ 'input_producer_op', + # Ops that adds the preprocessed images to the staging areas + 'enqueue_ops', + # Fetches of sess.run() + 'fetches', + # Op that performs synchronization in distributed mode + 'execution_barrier', + # The global step variable + 'global_step', + # Group of ops that perform per-device initialization work + 'local_var_init_op_group', + # Op to produce summaries + 'summary_op' + ]) + + +# InputProcessingInfo contains various sources of inputs which will be later fed +# into the model. If synthetic data is used, all three fields are None. +InputProcessingInfo = namedtuple( + 'InputProcessingInfo', + [ + # The first two fields are non-None iff datasets prefetching is not + # used. + + # Ops that produce the input batches. + 'input_producer_op', + # A list of StagingArea for each device. + 'input_producer_stages', + + # Input produced using multi device iterator. Non-None iff datasets + # prefetching is used + 'multi_device_iterator_input' + ]) + + +# A string specifying the npy file postfix for saving predicted logits. +flags.DEFINE_string('save_dir', '.', 'The dir to which the predicted logits npy file will be saved.') + +# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and +# float flags, and change certain string flags to enum flags. + +flags.DEFINE_string('model', 'trivial', + 'Name of the model to run, the list of supported models ' + 'are defined in models/model.py') +# The code will first check if it's running under benchmarking mode +# or evaluation mode, depending on 'eval': +# Under the evaluation mode, this script will read a saved model, +# and compute the accuracy of the model against a validation dataset. +# Additional ops for accuracy and top_k predictors are only used under +# this mode. +# Under the benchmarking mode, user can specify whether nor not to use +# the forward-only option, which will only compute the loss function. +# forward-only cannot be enabled with eval at the same time. +flags.DEFINE_boolean('eval', False, 'whether use eval or benchmarking') +flags.DEFINE_integer('eval_interval_secs', 0, + 'How often to run eval on saved checkpoints. Usually the ' + 'same as save_model_secs from the corresponding training ' + 'run. Pass 0 to eval only once.') +flags.DEFINE_integer('eval_during_training_every_n_steps', None, + 'Every n steps during training, pause training, run ' + 'evaluation, then resume training. Must not be used with ' + '--eval, as unlike --eval, this option causes both ' + 'training and eval to be done. This may take slightly ' + 'more GPU memory than running just training or evaluation ' + 'alone. It also may slightly slow down training, even ' + 'when not taking into account the additional time to ' + 'evaluate.', lower_bound=1) +flags.DEFINE_float('eval_during_training_every_n_epochs', None, + 'After every n training epochs, pause training, run ' + 'evaluation, then resume training. See ' + '--eval_during_training_every_n_steps for more information.') +flags.DEFINE_list('eval_during_training_at_specified_steps', [], + 'Specify a list of training steps, pause training at each of ' + 'these steps, run evaluation, then resume training. See ' + '--eval_during_training_every_n_steps for more information.') +flags.DEFINE_list('eval_during_training_at_specified_epochs', [], + 'Specify a list of training epochs, pause training after ' + 'each of these epochs, run evaluation, then resume training. 
' + 'See --eval_during_training_every_n_steps for more ' + 'information.') +flags.DEFINE_boolean('forward_only', False, + 'whether use forward-only or training for benchmarking') +flags.DEFINE_boolean('freeze_when_forward_only', False, + 'whether to freeze the graph when in forward-only mode.') +flags.DEFINE_boolean('print_training_accuracy', False, + 'whether to calculate and print training accuracy during ' + 'training') +flags.DEFINE_integer('batch_size', 0, 'batch size per compute device') +flags.DEFINE_integer('eval_batch_size', 0, 'eval batch size per compute device') +flags.DEFINE_integer('batch_group_size', 1, + 'number of groups of batches processed in the image ' + 'producer.') +flags.DEFINE_integer('num_batches', None, 'number of batches to run, excluding ' + 'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES) +flags.DEFINE_integer('num_eval_batches', None, + 'number of eval batches to run, excluding warmup. ' + 'Defaults to --num_batches') +flags.DEFINE_float('num_epochs', 90, + 'number of epochs to run, excluding warmup. ' + 'This and --num_batches cannot both be specified.') +flags.DEFINE_float('num_eval_epochs', None, + 'number of eval epochs to run, excluding warmup. ' + 'Defaults to --num_epochs') +flags.DEFINE_float('stop_at_top_1_accuracy', None, + 'If set, stops training after the evaluation accuracy hits ' + 'this number. Can only be used with one of the ' + '--eval_during_training_* flags.') +flags.DEFINE_boolean('collect_eval_results_async', False, + 'If True, start a separate process to postprocess eval ' + 'results asynchronously. This currently only works with ' + 'the SSD model.') +flags.DEFINE_integer('num_warmup_batches', None, + 'number of batches to run before timing') +flags.DEFINE_integer('autotune_threshold', None, + 'The autotune threshold for the models') +# TODO(tucker): change num_gpus to num_devices +flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on') +flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order') +flags.DEFINE_integer('display_every', 10, + 'Number of local steps after which progress is printed ' + 'out') +flags.DEFINE_float('display_perf_ewma', None, + 'If set, display numbers of images/sec using exponentially ' + 'weighted moving avearge with the specified weight, which ' + 'defines how much current value contributes to the reported ' + 'average. Increasing weight makes the reported performance ' + 'number reflect more about the real-time speed instead of ' + 'the entire history', lower_bound=0, upper_bound=1) +flags.DEFINE_string('data_dir', None, + 'Path to dataset in TFRecord format (aka Example ' + 'protobufs). If not specified, synthetic data will be ' + 'used.') +flags.DEFINE_string('data_name', None, + 'Name of dataset: imagenet or cifar10. If not specified, ' + 'it is automatically guessed based on data_dir.') +flags.DEFINE_string('resize_method', 'bilinear', + 'Method for resizing input images: crop, nearest, ' + 'bilinear, bicubic, area, or round_robin. The `crop` mode ' + 'requires source images to be at least as large as the ' + 'network input size. The `round_robin` mode applies ' + 'different resize methods based on position in a batch in ' + 'a round-robin fashion. Other modes support any sizes and ' + 'apply random bbox distortions before resizing (even with ' + 'distortions=False).') +flags.DEFINE_boolean('distortions', False, + 'Enable/disable distortions during image preprocessing. 
' + 'These include bbox and color distortions.') +flags.DEFINE_boolean('use_datasets', True, + 'Enable use of datasets for input pipeline') +flags.DEFINE_string('input_preprocessor', 'default', + 'Name of input preprocessor. The list of supported input ' + 'preprocessors are defined in preprocessing.py.') +flags.DEFINE_string('gpu_thread_mode', 'gpu_private', + 'Methods to assign GPU host work to threads. ' + 'global: all GPUs and CPUs share the same global threads; ' + 'gpu_private: a private threadpool for each GPU; ' + 'gpu_shared: all GPUs share the same threadpool.') +flags.DEFINE_integer('per_gpu_thread_count', 0, + 'The number of threads to use for GPU. Only valid when ' + 'gpu_thread_mode is not global.') +flags.DEFINE_boolean('hierarchical_copy', False, + 'Use hierarchical copies. Currently only optimized for ' + 'use on a DGX-1 with 8 GPUs and may perform poorly on ' + 'other hardware. Requires --num_gpus > 1, and only ' + 'recommended when --num_gpus=8') +# TODO(hinsu): Support auto-detection of the network topology while still +# retaining the ability to specify a particular topology for debugging. +flags.DEFINE_enum( + 'network_topology', constants.NetworkTopology.DGX1, + (constants.NetworkTopology.DGX1, constants.NetworkTopology.GCP_V100), + 'Network topology specifies the topology used to connect multiple devices. ' + 'Network topology is used to decide the hierarchy to use for the ' + 'hierarchical_copy.') +flags.DEFINE_integer('gradient_repacking', 0, 'Use gradient repacking. It' + 'currently only works with replicated mode. At the end of' + 'of each step, it repacks the gradients for more efficient' + 'cross-device transportation. A non-zero value specifies' + 'the number of split packs that will be formed.', + lower_bound=0) +flags.DEFINE_boolean('compact_gradient_transfer', True, 'Compact gradient' + 'as much as possible for cross-device transfer and ' + 'aggregation.') +flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'), + 'The data consistency for trainable variables. With strong ' + 'consistency, the variable always have the updates from ' + 'previous step. With relaxed consistency, all the updates ' + 'will eventually show up in the variables. Likely one step ' + 'behind.') +flags.DEFINE_boolean('datasets_repeat_cached_sample', False, + 'Enable use of a special datasets pipeline that reads a ' + 'single TFRecord into memory and repeats it infinitely ' + 'many times. The purpose of this flag is to make it ' + 'possible to write regression tests that are not ' + 'bottlenecked by CNS throughput. ' + 'Use datasets_use_caching to cache input data.') +flags.DEFINE_enum('local_parameter_device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'), + 'Device to use as parameter server: cpu or gpu. For ' + 'distributed training, it can affect where caching of ' + 'variables happens.') +flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'), + 'Device to use for computation: cpu or gpu') +flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'), + 'Data layout to use: NHWC (TF native) or NCHW (cuDNN ' + 'native, requires GPU).') +flags.DEFINE_integer('num_intra_threads', None, + 'Number of threads to use for intra-op parallelism. If ' + 'set to 0, the system will pick an appropriate number. ' + 'None is the same as 0 except that it disables intra-op ' + 'parallelism on a GPU.') +flags.DEFINE_integer('num_inter_threads', 0, + 'Number of threads to use for inter-op parallelism. 
If ' + 'set to 0, the system will pick an appropriate number.') +flags.DEFINE_boolean('use_numa_affinity', False, + 'Whether to turn on NUMA affinity for CPU devices. ' + 'This is probably only useful when --device=cpu.') +flags.DEFINE_string('trace_file', '', + 'Enable TensorFlow tracing and write trace to this file.') +flags.DEFINE_boolean('use_chrome_trace_format', True, + 'If True, the trace_file, if specified, will be in a ' + 'Chrome trace format. If False, then it will be a ' + 'StepStats raw proto.') +flags.DEFINE_boolean('use_deep_stem', False, + 'If True, use deep stem style (replace 7*7 conv to 3 3*3 conv) ' + 'Resnet model only') +_NUM_STEPS_TO_PROFILE = 10 +_NUM_OPS_TO_PRINT = 20 +flags.DEFINE_string('tfprof_file', None, + 'If specified, write a tfprof ProfileProto to this file. ' + 'The performance and other aspects of the model can then ' + 'be analyzed with tfprof. See ' + 'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md ' # pylint: disable=line-too-long + 'for more info on how to do this. The first %d steps ' + 'are profiled. Additionally, the top %d most time ' + 'consuming ops will be printed.\n' + 'Note: profiling with tfprof is very slow, but most of the ' + 'overhead is spent between steps. So, profiling results ' + 'are more accurate than the slowdown would suggest.' % + (_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT)) +flags.DEFINE_string('graph_file', None, + 'Write the model\'s graph definition to this file. ' + 'Defaults to binary format unless filename ends in "txt".') +flags.DEFINE_string('partitioned_graph_file_prefix', None, + 'If specified, after the graph has been partitioned and ' + 'optimized, write out each partitioned graph to a file ' + 'with the given prefix.') +flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'), + 'Optimizer to use') +flags.DEFINE_float('init_learning_rate', None, + 'Initial learning rate for training.') +flags.DEFINE_string('piecewise_learning_rate_schedule', None, + 'Specifies a piecewise learning rate schedule based on the ' + 'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, ' + 'where each LRi is a learning rate and each Ei is an epoch ' + 'indexed from 0. The learning rate is LRi if the ' + 'E(i-1) <= current_epoch < Ei. For example, if this ' + 'paramater is 0.3;10;0.2;25;0.1, the learning rate is 0.3 ' + 'for the first 10 epochs, then is 0.2 for the next 15 ' + 'epochs, then is 0.1 until training ends.') +flags.DEFINE_float('num_epochs_per_decay', 0, + 'Steps after which learning rate decays. If 0, the learning ' + 'rate does not decay.') +flags.DEFINE_float('learning_rate_decay_factor', 0, + 'Learning rate decay factor. Decay by this factor every ' + '`num_epochs_per_decay` epochs. If 0, learning rate does ' + 'not decay.') +flags.DEFINE_float('num_learning_rate_warmup_epochs', 0, + 'Slowly increase to the initial learning rate in the first ' + 'num_learning_rate_warmup_epochs linearly.') +flags.DEFINE_float('minimum_learning_rate', 0, + 'The minimum learning rate. The learning rate will ' + 'never decay past this value. Requires `learning_rate`, ' + '`num_epochs_per_decay` and `learning_rate_decay_factor` to ' + 'be set.') +flags.DEFINE_float('resnet_base_lr', None, "Base learning rate at bs=256. 
Only " + "relevant when training ResNet and utilizing the model's " + "learning rate heuristic (get_learning_rate).") +flags.DEFINE_float('momentum', 0.9, 'Momentum for training.') +flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.') +flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.') +flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.') +flags.DEFINE_float('adam_beta1', 0.9, 'Beta2 term for the Adam optimizer') +flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer') +flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer') +flags.DEFINE_float('gradient_clip', None, + 'Gradient clipping magnitude. Disabled by default.') +flags.DEFINE_float('weight_decay', 0.00004, + 'Weight decay factor for training.') +flags.DEFINE_float('gpu_memory_frac_for_testing', 0, + 'If non-zero, the fraction of GPU memory that will be used. ' + 'Useful for testing the benchmark script, as this allows ' + 'distributed mode to be run on a single machine. For ' + 'example, if there are two tasks, each can be allocated ' + '~40 percent of the memory on a single machine. This is ' + 'also useful for using unified memory, as this can be set ' + 'above 1 to oversubscribe the GPU using unified memory.', + lower_bound=0.) +flags.DEFINE_boolean('use_unified_memory', None, + 'If True, allocate unified memory enabling larger models ' + 'to fit in available device RAM.') +flags.DEFINE_boolean('timestamped_allocator', False, + 'If True marks free BFCAllocator::Chunks with time ' + 'at which they are freed which can allow more efficient ' + 'memory allocation in cases like RDMA networking.') +flags.DEFINE_integer('gpu_kt_max_interval', 0, + 'If > 0, the maximum number of GPU Ops that may be queued ' + 'in a row without also queuing a tracking event.') +flags.DEFINE_integer('gpu_kt_max_bytes', 0, + 'If > 0, the maximum number of bytes ' + 'of GPU memory that may be allocated by sequential ' + 'GPU Ops without queuing a tracking event.') +flags.DEFINE_integer('gpu_kt_max_pending', 0, + 'If > 0 no more than this many GPU tracking events may be ' + 'outstanding at any time. When this limit is reached ' + 'launch of additional kernels will stall until an ' + 'outstanding event completes.') +flags.DEFINE_boolean('use_tf_layers', True, + 'If True, use tf.layers for neural network layers. This ' + 'should not affect performance or accuracy in any way.') +flags.DEFINE_integer('tf_random_seed', 1234, + 'The TensorFlow random seed. Useful for debugging NaNs, ' + 'as this can be set to various values to see if the NaNs ' + 'depend on the seed.') +flags.DEFINE_string('debugger', None, + 'If set, use the TensorFlow debugger. If set to "cli", use ' + 'the local CLI debugger. Otherwise, this must be in the ' + 'form hostname:port (e.g., localhost:7007) in which case ' + 'the experimental TensorBoard debugger will be used') +flags.DEFINE_boolean('use_python32_barrier', False, + 'When on, use threading.Barrier at Python 3.2.') + +flags.DEFINE_boolean('ml_perf', False, + 'When True, change how the Imagenet input pipeline works ' + 'slightly to meet the MLPerf compliance rules. This slows ' + 'down the input pipeline. Without this option, at the end ' + 'of the input pipeline, the image is divided by 127.5, ' + 'then 1.0 is subtracted from it, bringing the image ' + 'values from [0, 255] to [-1.0, 1.0]. 
With this option, ' + 'each of the three channels (red, green, blue) have the ' + 'average channel value among all image subtracted from ' + 'it, and no division is done.') + +flags.DEFINE_boolean('datasets_use_prefetch', True, + 'Enable use of prefetched datasets for input pipeline. ' + 'This option is meaningless if use_datasets=False.') +flags.DEFINE_integer('datasets_prefetch_buffer_size', 1, + 'Prefetching op buffer size per compute device.') +flags.DEFINE_integer('datasets_num_private_threads', None, + 'Number of threads for a private threadpool created for ' + 'all datasets computation. By default, we pick an ' + 'appropriate number. If set to 0, we use the default ' + 'tf-Compute threads for dataset operations.') +flags.DEFINE_boolean('datasets_use_caching', False, + 'Cache the compressed input data in memory. This improves ' + 'the data input performance, at the cost of additional ' + 'memory.') +flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None, + 'Number of parallel file readers interleaving input data.') +flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False, + 'Allow parallel interleave to depart from deterministic ' + 'ordering, by temporarily skipping over files whose ' + 'elements are not readily available. This can increase ' + 'througput in particular in the presence of stragglers.') +flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None, + 'The number of input elements to fetch before they are ' + 'needed for interleaving.') + +flags.DEFINE_integer( + 'multi_device_iterator_max_buffer_size', 1, + 'Configuration parameter for the MultiDeviceIterator that ' + ' specifies the host side buffer size for each device.') + +# Performance tuning parameters. +flags.DEFINE_boolean('winograd_nonfused', True, + 'Enable/disable using the Winograd non-fused algorithms.') +flags.DEFINE_boolean( + 'batchnorm_persistent', True, + 'Enable/disable using the CUDNN_BATCHNORM_SPATIAL_PERSISTENT ' + 'mode for batchnorm.') +flags.DEFINE_boolean('sync_on_finish', False, + 'Enable/disable whether the devices are synced after each ' + 'step.') +flags.DEFINE_boolean('staged_vars', False, + 'whether the variables are staged from the main ' + 'computation') +flags.DEFINE_boolean('force_gpu_compatible', False, + 'whether to enable force_gpu_compatible in GPU_Options') +flags.DEFINE_boolean('allow_growth', None, + 'whether to enable allow_growth in GPU_Options') +flags.DEFINE_boolean('xla', False, 'whether to enable XLA auto-jit compilation') +flags.DEFINE_boolean('xla_compile', False, + 'Enable xla to compile the graph. Uncompilable ops will ' + 'result in fatal errors.') +flags.DEFINE_boolean('fuse_decode_and_crop', True, + 'Fuse decode_and_crop for image preprocessing.') +flags.DEFINE_boolean('distort_color_in_yiq', True, + 'Distort color of input images in YIQ space.') +flags.DEFINE_boolean('enable_optimizations', True, + 'Whether to enable grappler and other optimizations.') +flags.DEFINE_string('rewriter_config', None, + 'Config for graph optimizers, described as a ' + 'RewriterConfig proto buffer.') +flags.DEFINE_enum('loss_type_to_report', 'total_loss', + ('base_loss', 'total_loss'), + 'Which type of loss to output and to write summaries for. ' + 'The total loss includes L2 loss while the base loss does ' + 'not. 
Note that the total loss is always used while ' + 'computing gradients during training if weight_decay > 0, ' + 'but explicitly computing the total loss, instead of just ' + 'computing its gradients, can have a performance impact.') +flags.DEFINE_boolean('single_l2_loss_op', False, + 'If True, instead of using an L2 loss op per variable, ' + 'concatenate the variables into a single tensor and do a ' + 'single L2 loss on the concatenated tensor.') +flags.DEFINE_boolean('use_resource_vars', False, + 'Use resource variables instead of normal variables. ' + 'Resource variables are slower, but this option is useful ' + 'for debugging their performance.') +flags.DEFINE_boolean('compute_lr_on_cpu', False, + 'If True, do computations related to learning rate on the ' + 'CPU instead of the GPU. This will significantly improve ' + 'XLA performance in some cases.') +flags.DEFINE_boolean('sparse_to_dense_grads', False, + 'If True, convert all sparse gradients to dense gradients ' + 'before passing them to the optimizer to update ' + 'variables. Only affects models with sparse gradients, ' + 'which currently is only the NCF model.') +# Performance tuning specific to MKL. +flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.') +flags.DEFINE_integer('kmp_blocktime', 0, + 'The time, in milliseconds, that a thread should wait, ' + 'after completing the execution of a parallel region, ' + 'before sleeping') +flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0', + 'Restricts execution of certain threads (virtual execution ' + 'units) to a subset of the physical processing units in a ' + 'multiprocessor computer.') +flags.DEFINE_integer('kmp_settings', 1, + 'If set to 1, MKL settings will be printed.') + +# fp16 parameters. If use_fp16=False, no other fp16 parameters apply. +flags.DEFINE_boolean('use_fp16', False, + 'Use 16-bit floats for certain tensors instead of 32-bit ' + 'floats. This is currently experimental.') +# TODO(reedwm): The default loss scale of 128 causes most models to diverge +# on the second step with synthetic data. Changing the tf.set_random_seed +# call to tf.set_random_seed(1235) or most other seed values causes the +# issue not to occur. +flags.DEFINE_float('fp16_loss_scale', None, + 'If fp16 is enabled, the loss is multiplied by this amount ' + 'right before gradients are computed, then each gradient ' + 'is divided by this amount. Mathematically, this has no ' + 'effect, but it helps avoid fp16 underflow. Set to 1 to ' + 'effectively disable. Ignored during eval.') +flags.DEFINE_boolean('fp16_vars', False, + 'If fp16 is enabled, also use fp16 for variables. If ' + 'False, the variables are stored in fp32 and casted to ' + 'fp16 when retrieved. Recommended to leave as False.') +flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False, + 'If True and use_fp16 is True, automatically adjust the ' + 'loss scale during training.') +flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000, + 'If fp16 is enabled and fp16_enable_auto_loss_scale is ' + 'True, increase the loss scale every n steps.') + +# The method for managing variables: +# parameter_server: variables are stored on a parameter server that holds +# the master copy of the variable. In local execution, a local device +# acts as the parameter server for each variable; in distributed +# execution, the parameter servers are separate processes in the +# cluster. 
+# For each step, each tower gets a copy of the variables from the +# parameter server, and sends its gradients to the param server. +# replicated: each GPU has its own copy of the variables. To apply +# gradients, an all_reduce algorithm or or regular cross-device +# aggregation is used to replicate the combined gradients to all +# towers (depending on all_reduce_spec parameter setting). +# independent: each GPU has its own copy of the variables, and gradients +# are not shared between towers. This can be used to check performance +# when no data is moved between GPUs. +# distributed_replicated: Distributed training only. Each GPU has a copy +# of the variables, and updates its copy after the parameter servers +# are all updated with the gradients from all servers. Only works with +# cross_replica_sync=true. Unlike 'replicated', currently never uses +# nccl all-reduce for replicating within a server. +# distributed_all_reduce: Distributed training where all replicas run +# in a single session, using all-reduce to mutally reduce the +# gradients. Uses no parameter servers. When there is only one +# worker, this is the same as replicated. +# collective_all_reduce: Distributed training where all replicas run +# independepently except for variable initialization and for +# gradient reduction which is done via collective all-reduce. +# NOTE: collective_all_reduce in conjunction with use_fp16 can +# lead to NaNs in some models (resnet50). TODO(tucker): fix it. +# horovod: Distributed training using Horovod library. Runs workers using +# an MPI framework (e.g. Open MPI). Each worker runs training on +# single GPU, and averages gradients using NCCL or MPI all-reduce. +# See https://github.com/uber/horovod for more details. +flags.DEFINE_enum('variable_update', 'parameter_server', + ('parameter_server', 'replicated', 'distributed_replicated', + 'independent', 'distributed_all_reduce', + 'collective_all_reduce', 'horovod'), + 'The method for managing variables: parameter_server, ' + 'replicated, distributed_replicated, independent, ' + 'distributed_all_reduce, collective_all_reduce, horovod') +flags.DEFINE_string('all_reduce_spec', None, + 'A specification of the all_reduce algorithm to be used ' + 'for reducing gradients. For more details, see ' + 'parse_all_reduce_spec in variable_mgr.py. An ' + 'all_reduce_spec has BNF form:\n' + 'int ::= positive whole number\n' + 'g_int ::= int[KkMGT]?\n' + 'alg_spec ::= alg | alg#int\n' + 'range_spec ::= alg_spec | alg_spec/alg_spec\n' + 'spec ::= range_spec | range_spec:g_int:range_spec\n' + 'NOTE: not all syntactically correct constructs are ' + 'supported.\n\n' + 'Examples:\n ' + '"xring" == use one global ring reduction for all ' + 'tensors\n' + '"pscpu" == use CPU at worker 0 to reduce all tensors\n' + '"nccl" == use NCCL to locally reduce all tensors. ' + 'Limited to 1 worker.\n' + '"nccl/xring" == locally (to one worker) reduce values ' + 'using NCCL then ring reduce across workers.\n' + '"pscpu:32k:xring" == use pscpu algorithm for tensors of ' + 'size up to 32kB, then xring for larger tensors.') + +# If variable_update==distributed_all_reduce then it may be advantageous +# to aggregate small tensors into one prior to reduction. These parameters +# control that aggregation. 
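+# For example, passing --agg_small_grads_max_bytes=1024 and
+# --agg_small_grads_max_group=16 would pack up to 16 tensors of under 1KB each
+# into a single tensor before the all-reduce (values are illustrative only,
+# not recommendations).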
+flags.DEFINE_integer('agg_small_grads_max_bytes', 0, + 'If > 0, try to aggregate tensors of less than this ' + 'number of bytes prior to all-reduce.') +flags.DEFINE_integer('agg_small_grads_max_group', 10, + 'When aggregating small tensors for all-reduce do not ' + 'aggregate more than this many into one new tensor.') +flags.DEFINE_integer('allreduce_merge_scope', 1, + 'Establish a name scope around this many ' + 'gradients prior to creating the all-reduce operations. ' + 'It may affect the ability of the backend to merge ' + 'parallel ops.') + +# Distributed training parameters. +flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''), + 'One of "ps", "worker", "controller", "". Empty for local ' + 'training') +flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts') +flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts') +flags.DEFINE_string('controller_host', None, 'optional controller host') +flags.DEFINE_integer('task_index', 0, 'Index of task within the job') +flags.DEFINE_string('server_protocol', 'grpc', 'protocol for servers') +flags.DEFINE_boolean('cross_replica_sync', True, '') +flags.DEFINE_string('horovod_device', '', 'Device to do Horovod all-reduce on: ' + 'empty (default), cpu or gpu. Default with utilize GPU if ' + 'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE ' + 'option, and CPU otherwise.') + +# Summary and Save & load checkpoints. +flags.DEFINE_integer('summary_verbosity', 0, 'Verbosity level for summary ops. ' + 'level 0: disable any summary.\n' + 'level 1: small and fast ops, e.g.: learning_rate, ' + 'total_loss.\n' + 'level 2: medium-cost ops, e.g. histogram of all ' + 'gradients.\n' + 'level 3: expensive ops: images and histogram of each ' + 'gradient.\n') +flags.DEFINE_integer('save_summaries_steps', 0, + 'How often to save summaries for trained models. Pass 0 ' + 'to disable summaries.') +flags.DEFINE_integer('save_model_secs', 0, + 'How often to save trained models. Pass 0 to disable ' + 'saving checkpoints every N seconds. A checkpoint is ' + 'saved after training completes regardless of this ' + 'option.') +flags.DEFINE_integer('save_model_steps', None, + 'How often to save trained models. If specified, ' + 'save_model_secs must not be specified.') +flags.DEFINE_integer('max_ckpts_to_keep', 5, + 'Max number of checkpoints to keep.') +flags.DEFINE_string('train_dir', None, + 'Path to session checkpoints. Pass None to disable saving ' + 'checkpoint at the end.') +flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval', + 'Directory where to write eval event logs.') +flags.DEFINE_string('backbone_model_path', None, + 'Path to pretrained backbone model checkpoint. Pass None ' + 'if not using a backbone model.') +flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'], + 'If this is specified in forward_only mode and ' + 'freeze_when_forward_only is set to True, use TensorRT to ' + 'optimize the graph before execution.') +flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30, + 'Max workspace size bytes used by the TensorRT optimizer.') + +# Benchmark logging for model garden metric +flags.DEFINE_string('benchmark_log_dir', None, + 'The directory to place the log files containing the ' + 'results of benchmark. The logs are created by ' + 'BenchmarkFileLogger. Requires the root of the Tensorflow ' + 'models repository to be in $PYTHTONPATH.') +flags.DEFINE_string('benchmark_test_id', None, + 'The unique test ID of the benchmark run. 
It could be the ' + 'combination of key parameters. It is hardware independent ' + 'and could be used compare the performance between ' + 'different test runs. This flag is designed for human ' + 'consumption, and does not have any impact within the ' + 'system.') + +platforms_util.define_platform_params() + + +class GlobalStepWatcher(threading.Thread): + """A helper class for global_step. + + Polls for changes in the global_step of the model, and finishes when the + number of steps for the global run are done. + """ + + def __init__(self, sess, global_step_op, start_at_global_step, + end_at_global_step): + threading.Thread.__init__(self) + self.sess = sess + self.global_step_op = global_step_op + self.start_at_global_step = start_at_global_step + self.end_at_global_step = end_at_global_step + + self.start_time = 0 + self.start_step = 0 + self.finish_time = 0 + self.finish_step = 0 + + def run(self): + while self.finish_time == 0: + time.sleep(.25) + global_step_val, = self.sess.run([self.global_step_op]) + if self.start_time == 0 and global_step_val >= self.start_at_global_step: + # Use tf.logging.info instead of log_fn, since print (which is log_fn) + # is not thread safe and may interleave the outputs from two parallel + # calls to print, which can break tests. + tf.logging.info('Starting real work at step %s at time %s' % + (global_step_val, time.ctime())) + self.start_time = time.time() + self.start_step = global_step_val + if self.finish_time == 0 and global_step_val >= self.end_at_global_step: + tf.logging.info('Finishing real work at step %s at time %s' % + (global_step_val, time.ctime())) + self.finish_time = time.time() + self.finish_step = global_step_val + + def done(self): + return self.finish_time > 0 + + def num_steps(self): + return self.finish_step - self.start_step + + def elapsed_time(self): + return self.finish_time - self.start_time + + +class CheckpointNotFoundException(Exception): + pass + + +def create_config_proto(params): + """Returns session config proto. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. 
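+
+  Returns:
+    A tf.ConfigProto for the tf.Session, configured according to `params`.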
+ """ + config = tf.ConfigProto() + config.allow_soft_placement = True + if params.num_intra_threads is None: + if params.device == 'gpu': + config.intra_op_parallelism_threads = 1 + else: + config.intra_op_parallelism_threads = params.num_intra_threads + config.inter_op_parallelism_threads = params.num_inter_threads + config.experimental.collective_group_leader = '/job:worker/replica:0/task:0' + config.gpu_options.experimental.collective_ring_order = params.gpu_indices + config.gpu_options.force_gpu_compatible = params.force_gpu_compatible + config.experimental.use_numa_affinity = params.use_numa_affinity + if params.device == 'cpu': + # TODO(tucker): change num_gpus to num_devices + config.device_count['CPU'] = params.num_gpus + if params.allow_growth is not None: + config.gpu_options.allow_growth = params.allow_growth + if params.gpu_memory_frac_for_testing > 0: + config.gpu_options.per_process_gpu_memory_fraction = ( + params.gpu_memory_frac_for_testing) + if params.use_unified_memory: + config.gpu_options.experimental.use_unified_memory = ( + params.use_unified_memory) + if params.timestamped_allocator: + config.gpu_options.experimental.timestamped_allocator = ( + params.timestamped_allocator) + if params.gpu_kt_max_interval > 0: + config.gpu_options.experimental.kernel_tracker_max_interval = ( + params.gpu_kt_max_interval) + if params.gpu_kt_max_bytes > 0: + config.gpu_options.experimental.kernel_tracker_max_bytes = ( + params.gpu_kt_max_bytes) + if params.gpu_kt_max_pending > 0: + config.gpu_options.experimental.kernel_tracker_max_pending = ( + params.gpu_kt_max_pending) + if params.xla: + config.graph_options.optimizer_options.global_jit_level = ( + tf.OptimizerOptions.ON_1) + if params.rewriter_config: + rewriter_config = rewriter_config_pb2.RewriterConfig() + text_format.Merge(params.rewriter_config, rewriter_config) + config.graph_options.rewrite_options.CopyFrom(rewriter_config) + elif not params.enable_optimizations: + config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0 + config.graph_options.rewrite_options.disable_meta_optimizer = True + elif params.variable_update == 'collective_all_reduce': + rewrite_options = config.graph_options.rewrite_options + rewrite_options.scoped_allocator_optimization = ( + rewriter_config_pb2.RewriterConfig.ON) + rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce') + if params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + config.gpu_options.visible_device_list = str(hvd.local_rank()) + # For collective_all_reduce, ignore all devices except current worker. + if params.variable_update == 'collective_all_reduce': + del config.device_filters[:] + config.device_filters.append( + '/job:%s/replica:0/task:%d' % (params.job_name, params.task_index)) + + # TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed. + # Currently we have to disable PinToHostOptimizer w/ XLA since it causes + # OOM/perf cliffs. + config.graph_options.rewrite_options.pin_to_host_optimization = ( + rewriter_config_pb2.RewriterConfig.OFF) + return config + + +def get_mode_from_params(params): + """Returns the mode in which this script is running. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + Raises: + ValueError: Unsupported params settings. 
+ """ + if params.forward_only and params.eval: + raise ValueError('Only one of forward_only and eval parameters is true') + + if params.eval: + return constants.BenchmarkMode.EVAL + elif params.forward_only: + return constants.BenchmarkMode.FORWARD_ONLY + elif (params.eval_during_training_every_n_steps or + params.eval_during_training_every_n_epochs or + params.eval_during_training_at_specified_steps or + params.eval_during_training_at_specified_epochs): + return constants.BenchmarkMode.TRAIN_AND_EVAL + else: + return constants.BenchmarkMode.TRAIN + + +# How many digits to show for the loss and accuracies during training. +LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3 + + +def benchmark_one_step(sess, + fetches, + step, + batch_size, + step_train_times, + trace_filename, + partitioned_graph_file_prefix, + profiler, + image_producer, + params, + summary_op=None, + show_images_per_sec=True, + benchmark_logger=None, + collective_graph_key=0): + """Advance one step of benchmarking.""" + should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE + need_options_and_metadata = ( + should_profile or collective_graph_key > 0 or + ((trace_filename or partitioned_graph_file_prefix) and step == -2) + ) + if need_options_and_metadata: + run_options = tf.RunOptions() + if (trace_filename and step == -2) or should_profile: + run_options.trace_level = tf.RunOptions.FULL_TRACE + if partitioned_graph_file_prefix and step == -2: + run_options.output_partition_graphs = True + if collective_graph_key > 0: + run_options.experimental.collective_graph_key = collective_graph_key + run_metadata = tf.RunMetadata() + else: + run_options = None + run_metadata = None + summary_str = None + start_time = time.time() + if summary_op is None: + results = sess.run(fetches, options=run_options, run_metadata=run_metadata) + else: + (results, summary_str) = sess.run( + [fetches, summary_op], options=run_options, run_metadata=run_metadata) + + if not params.forward_only: + lossval = results['average_loss'] + else: + lossval = 0. 
+ if image_producer is not None: + image_producer.notify_image_consumption() + train_time = time.time() - start_time + step_train_times.append(train_time) + if (show_images_per_sec and step >= 0 and + (step == 0 or (step + 1) % params.display_every == 0)): + speed_mean, speed_uncertainty, speed_jitter = get_perf_timing( + batch_size, step_train_times, params.display_perf_ewma) + log_str = '%i\t%s\t%.*f' % ( + step + 1, + get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter), + LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval) + if 'top_1_accuracy' in results: + log_str += '\t%.*f\t%.*f' % ( + LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'], + LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy']) + log_fn(log_str) + if benchmark_logger: + benchmark_logger.log_metric( + 'current_examples_per_sec', speed_mean, global_step=step + 1) + if 'top_1_accuracy' in results: + benchmark_logger.log_metric( + 'top_1_accuracy', results['top_1_accuracy'], global_step=step + 1) + benchmark_logger.log_metric( + 'top_5_accuracy', results['top_5_accuracy'], global_step=step + 1) + if need_options_and_metadata: + if should_profile: + profiler.add_step(step, run_metadata) + if trace_filename and step == -2: + log_fn('Dumping trace to %s' % trace_filename) + trace_dir = os.path.dirname(trace_filename) + if not gfile.Exists(trace_dir): + gfile.MakeDirs(trace_dir) + with gfile.Open(trace_filename, 'w') as trace_file: + if params.use_chrome_trace_format: + trace = timeline.Timeline(step_stats=run_metadata.step_stats) + trace_file.write(trace.generate_chrome_trace_format(show_memory=True)) + else: + trace_file.write(str(run_metadata.step_stats)) + if partitioned_graph_file_prefix and step == -2: + path, filename = os.path.split(partitioned_graph_file_prefix) + if '.' in filename: + base_filename, ext = filename.rsplit('.', 1) + ext = '.' + ext + else: + base_filename, ext = filename, '' + as_text = filename.endswith('txt') + for graph_def in run_metadata.partition_graphs: + device = graph_def.node[0].device.replace('/', '_').replace(':', '_') + graph_filename = '%s%s%s' % (base_filename, device, ext) + log_fn('Writing partitioned GraphDef as %s to %s' % ( + 'text' if as_text else 'binary', + os.path.join(path, graph_filename))) + tf.train.write_graph(graph_def, path, graph_filename, as_text) + return (summary_str, lossval) + + +def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1): + if scale == 1: + # TODO(laigd): rename 'images' to maybe 'inputs', same below. + return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' % + (speed_mean, speed_uncertainty, speed_jitter)) + else: + return 'images/sec: %.1f' % speed_mean + + +def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1): + """Calculate benchmark processing speed.""" + times = np.array(step_train_times) + speeds = batch_size / times + if ewma_alpha: + weights = np.logspace(len(times)-1, 0, len(times), base=1-ewma_alpha) + time_mean = np.average(times, weights=weights) + else: + time_mean = np.mean(times) + speed_mean = scale * batch_size / time_mean + speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds))) + speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds))) + return speed_mean, speed_uncertainty, speed_jitter + + +def load_checkpoint(saver, sess, ckpt_dir): + """Loads checkpoint from provided directory or full path. + + Args: + saver: Saver used to restore the checkpoint. + sess: TensorFlow session. + ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint. 
+ + Returns: + Global step. + """ + model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir) + global_step = model_checkpoint_path.split('/')[-1].split('-')[-1] + if not global_step.isdigit(): + global_step = 0 + else: + global_step = int(global_step) + saver.restore(sess, model_checkpoint_path) + log_fn('Successfully loaded model from %s.' % model_checkpoint_path) + return global_step + + +def _get_checkpoint_to_load(ckpt_dir): + """Returns which checkpoint to load. + + Args: + ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint. + + Returns: + Full path to checkpoint to load. + + Raises: + CheckpointNotFoundException: If checkpoint is not found. + """ + p = re.compile(r'ckpt-\d+$') + if p.search(ckpt_dir): + model_checkpoint_path = ckpt_dir + else: + # Finds latest checkpoint in directory provided + ckpt = tf.train.get_checkpoint_state(ckpt_dir) + if ckpt and ckpt.model_checkpoint_path: + model_checkpoint_path = ckpt.model_checkpoint_path + else: + raise CheckpointNotFoundException('No checkpoint file found in dir:{}'. + format(ckpt_dir)) + return model_checkpoint_path + + +# Params are passed to BenchmarkCNN's constructor. Params is a map from name +# to value, with one field per key in flags.param_specs. +# +# Call make_params() or make_params_from_flags() below to construct a Params +# tuple with default values from flags.param_specs, rather than constructing +# Params directly. +Params = namedtuple('Params', flags.param_specs.keys()) # pylint: disable=invalid-name + + +def validate_params(params): + """Validates that the Params tuple had valid values. + + When command-line flags are defined for each ParamSpec by calling + flags.define_flags(), calling this function is unnecessary because absl + already does flag validation. Otherwise, this function should be called. + + Args: + params: A Params tuple. + Raises: + ValueError: An element of params had an invalid value. + """ + for name, value in params._asdict().items(): + param_spec = flags.param_specs[name] + if param_spec.flag_type in ('integer', 'float'): + if (value is not None and param_spec.kwargs['lower_bound'] is not None and + value < param_spec.kwargs['lower_bound']): + raise ValueError('Param %s value of %s is lower than the lower bound ' + 'of %s' % + (name, value, param_spec.kwargs['lower_bound'])) + if (value is not None and param_spec.kwargs['upper_bound'] is not None and + param_spec.kwargs['upper_bound'] < value): + raise ValueError('Param %s value of %s is higher than the upper bound ' + 'of %s' % + (name, value, param_spec.kwargs['upper_bound'])) + elif (value is not None and param_spec.flag_type == 'enum' and + value not in param_spec.kwargs['enum_values']): + raise ValueError('Param %s of value %s is not in %s'% + (name, value, param_spec.kwargs['enum_values'])) + + +def make_params(**kwargs): + """Create a Params tuple for BenchmarkCNN from kwargs. + + Default values are filled in from flags.param_specs. + + Args: + **kwargs: kwarg values will override the default values. + Returns: + Params namedtuple for constructing BenchmarkCNN. + """ + # Create a (name: default_value) map from flags.param_specs. + default_kwargs = { + name: flags.param_specs[name].default_value + for name in flags.param_specs + } + params = Params(**default_kwargs)._replace(**kwargs) + validate_params(params) + return params + + +def make_params_from_flags(): + """Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS. + + Returns: + Params namedtuple for constructing BenchmarkCNN. 
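+
+  Example (sketch; assumes flags.define_flags() has been called and the absl
+  flags have already been parsed, e.g. inside absl.app.run()):
+
+    params = make_params_from_flags()
+    params = params._replace(model='resnet50', batch_size=64)
+    validate_params(params)
+    bench = BenchmarkCNN(params)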
+ """ + # Collect (name: value) pairs for absl_flags.FLAGS with matching names in + # flags.param_specs. + flag_values = {name: getattr(absl_flags.FLAGS, name) + for name in flags.param_specs.keys()} + return Params(**flag_values) + + +def remove_param_fields(params, fields_to_remove): + """Remove fields from a Params namedtuple.""" + params_dict = params._asdict() + for field in fields_to_remove: + assert field in params_dict, 'Invalid Params field: ' + field + params_dict = {k: v for k, v in params_dict.items() + if k not in fields_to_remove} + new_params_type = namedtuple('Params', params_dict.keys()) + return new_params_type(**params_dict) + + +def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch): + """Returns the number of batches and epochs to run for. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + batch_size: The number of images per step. + num_examples_per_epoch: The number of images in a single epoch. + + Returns: + num_batches: The number of batches to run for. + num_epochs: The number of epochs to run for. This might be slightly + smaller than params.num_epochs if specified, because the number of batches + must be an integer. + + Raises: + ValueError: Invalid or unsupported params. + """ + if params.num_batches and params.num_epochs: + raise ValueError('At most one of --num_batches and --num_epochs may be ' + 'specified.') + if params.num_epochs: + num_batches = int(params.num_epochs * num_examples_per_epoch + + batch_size - 1) // batch_size + else: + num_batches = params.num_batches or _DEFAULT_NUM_BATCHES + num_epochs = num_batches * batch_size / num_examples_per_epoch + return (num_batches, num_epochs) + + +def get_piecewise_learning_rate(piecewise_learning_rate_schedule, + global_step, num_batches_per_epoch): + """Returns a piecewise learning rate tensor. + + Args: + piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule + parameter + global_step: Scalar tensor representing the global step. + num_batches_per_epoch: float indicating the number of batches per epoch. + + Returns: + A scalar float tensor, representing the learning rate. + + Raises: + ValueError: piecewise_learning_rate_schedule is not formatted correctly. + """ + pieces = piecewise_learning_rate_schedule.split(';') + if len(pieces) % 2 == 0: + raise ValueError('--piecewise_learning_rate_schedule must have an odd ' + 'number of components') + values = [] + boundaries = [] + for i, piece in enumerate(pieces): + if i % 2 == 0: + try: + values.append(float(piece)) + except ValueError: + raise ValueError('Invalid learning rate: ' + piece) + else: + try: + boundaries.append(int(int(piece) * num_batches_per_epoch) - 1) + except ValueError: + raise ValueError('Invalid epoch: ' + piece) + return tf.train.piecewise_constant(global_step, boundaries, values, + name='piecewise_learning_rate') + + +def get_learning_rate(params, global_step, num_examples_per_epoch, model, + batch_size): + """Returns a learning rate tensor based on global_step. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + global_step: Scalar tensor representing the global step. + num_examples_per_epoch: The number of examples per epoch. + model: The model.Model object to obtain the default learning rate from if no + learning rate is specified. + batch_size: Number of examples per step + + Returns: + A scalar float tensor, representing the learning rate. 
When evaluated, the + learning rate depends on the current value of global_step. + + Raises: + ValueError: Invalid or unsupported params. + """ + with tf.name_scope('learning_rate'): + num_batches_per_epoch = num_examples_per_epoch / batch_size + + if params.piecewise_learning_rate_schedule: + if (params.init_learning_rate is not None or + params.learning_rate_decay_factor or + params.minimum_learning_rate or params.num_epochs_per_decay): + raise ValueError('No other learning rate-related flags can be ' + 'specified if --piecewise_learning_rate_schedule is ' + 'specified') + learning_rate = get_piecewise_learning_rate( + params.piecewise_learning_rate_schedule, + global_step, num_batches_per_epoch) + elif params.init_learning_rate is not None: + learning_rate = params.init_learning_rate + if (params.num_epochs_per_decay > 0 and + params.learning_rate_decay_factor > 0): + decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay) + + # Decay the learning rate exponentially based on the number of steps. + learning_rate = tf.train.exponential_decay( + params.init_learning_rate, + global_step, + decay_steps, + params.learning_rate_decay_factor, + staircase=True) + + if params.minimum_learning_rate != 0.: + learning_rate = tf.maximum(learning_rate, + params.minimum_learning_rate) + else: + learning_rate = model.get_learning_rate(global_step, batch_size) + if params.num_learning_rate_warmup_epochs > 0 and ( + params.init_learning_rate is not None or + params.piecewise_learning_rate_schedule): + warmup_steps = int(num_batches_per_epoch * + params.num_learning_rate_warmup_epochs) + init_lr = params.init_learning_rate + if init_lr is None: + init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0]) + warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast( + warmup_steps, tf.float32) + learning_rate = tf.cond(global_step < warmup_steps, + lambda: warmup_lr, lambda: learning_rate) + + learning_rate = mlperf.logger.log_deferred_tensor_value( + mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100) + return learning_rate + + +def get_optimizer(params, learning_rate): + """Returns the optimizer that should be used based on params.""" + if params.optimizer == 'momentum': + mlperf.logger.log(key=mlperf.tags.OPT_NAME, + value=mlperf.tags.SGD_WITH_MOMENTUM) + mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum) + opt = tf.train.MomentumOptimizer( + learning_rate, params.momentum, use_nesterov=True) + elif params.optimizer == 'sgd': + mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD) + opt = tf.train.GradientDescentOptimizer(learning_rate) + elif params.optimizer == 'rmsprop': + opt = tf.train.RMSPropOptimizer( + learning_rate, + params.rmsprop_decay, + momentum=params.rmsprop_momentum, + epsilon=params.rmsprop_epsilon) + elif params.optimizer == 'adam': + opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1, + params.adam_beta2, params.adam_epsilon) + else: + raise ValueError('Optimizer "{}" was not recognized'. + format(params.optimizer)) + return opt + + +def generate_tfprof_profile(profiler, tfprof_file): + """Generates a tfprof profile, writing it to a file and printing top ops. + + Args: + profiler: A tf.profiler.Profiler. `profiler.add_step` must have already been + called. + tfprof_file: The filename to write the ProfileProto to. 
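+
+  Example (sketch of the intended call pattern; the output path is
+  hypothetical):
+
+    profiler = tf.profiler.Profiler()
+    # ... run training steps, calling profiler.add_step(step, run_metadata)
+    # for the profiled steps (see benchmark_one_step) ...
+    generate_tfprof_profile(profiler, '/tmp/benchmark.tfprof')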
+ """ + profile_proto = profiler.serialize_to_string() + log_fn('Dumping ProfileProto to %s' % tfprof_file) + with gfile.Open(tfprof_file, 'wb') as f: + f.write(profile_proto) + + # Print out the execution times of the top operations. Note this + # information can also be obtained with the dumped ProfileProto, but + # printing it means tfprof doesn't have to be used if all the user wants + # is the top ops. + options = tf.profiler.ProfileOptionBuilder.time_and_memory() + options['max_depth'] = _NUM_OPS_TO_PRINT + options['order_by'] = 'accelerator_micros' + profiler.profile_operations(options) + + +class BenchmarkCNN(object): + """Class for benchmarking a cnn network.""" + + def __init__(self, params, dataset=None, model=None): + """Initialize BenchmarkCNN. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + dataset: If not None, the dataset to use. Otherwise, params is used to + obtain the dataset. + model: If not None, the model to use. Otherwise, params is used to obtain + the model. + Raises: + ValueError: Unsupported params settings. + """ + mlperf.logger.log(key=mlperf.tags.RUN_START) + self.params = params + if params.eval: + self._doing_eval = True + else: + # Note self._doing_eval can later switch to True in self._do_eval() if + # self.params.eval_during_training_* is specified. + self._doing_eval = False + self.dataset = dataset or datasets.create_dataset(self.params.data_dir, + self.params.data_name) + self.model = model or model_config.get_model_config( + self.params.model, self.dataset, self.params) + self.trace_filename = self.params.trace_file + self.rewriter_config = self.params.rewriter_config + autotune_threshold = self.params.autotune_threshold if ( + self.params.autotune_threshold) else 1 + min_autotune_warmup = 5 * autotune_threshold * autotune_threshold + self.num_warmup_batches = self.params.num_warmup_batches if ( + self.params.num_warmup_batches is not None) else max( + 10, min_autotune_warmup) + self.graph_file = self.params.graph_file + self.resize_method = self.params.resize_method + self.sync_queue_counter = 0 + self.num_gpus = self.params.num_gpus + if self.params.gpu_indices: + self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')] + else: + self.gpu_indices = [x for x in range(self.num_gpus)] + + if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and + not self.params.mkl): + raise ValueError('device=cpu requires that data_format=NHWC') + + if ((self.params.num_epochs_per_decay or + self.params.learning_rate_decay_factor) and + not (self.params.init_learning_rate is not None and + self.params.num_epochs_per_decay + and self.params.learning_rate_decay_factor)): + raise ValueError('If one of num_epochs_per_decay or ' + 'learning_rate_decay_factor is set, both must be set' + 'and learning_rate must be set') + if (self.params.minimum_learning_rate and + not (self.params.init_learning_rate is not None and + self.params.num_epochs_per_decay and + self.params.learning_rate_decay_factor)): + raise ValueError('minimum_learning_rate requires learning_rate,' + 'num_epochs_per_decay, and ' + 'learning_rate_decay_factor to be set') + + if (self.params.use_fp16 and self.params.fp16_vars and + 'replicated' in self.params.variable_update and + self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec): + raise ValueError('fp16 variables are not supported with NCCL') + if (self.params.use_fp16 and self.params.fp16_vars and + self.params.gradient_repacking): + raise 
ValueError('--fp16_vars cannot be used with --gradient_repacking') + + if self.params.variable_update == 'horovod' and self.params.num_gpus > 1: + raise ValueError('Horovod benchmarks require num_gpus=1 on each worker') + + if self.params.variable_update == 'horovod' and self.params.job_name: + raise ValueError('job_name should not be specified for Horovod.') + + if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale: + if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec: + raise ValueError('Automatic loss scaling is not supported with NCCL.') + if self.params.variable_update not in ('parameter_server', 'replicated', + 'independent'): + raise ValueError('Automatic loss scaling is not supported with ' + 'variable_update=%s.' % self.params.variable_update) + if self.params.staged_vars: + raise ValueError('Automatic loss scaling is not supported with' + 'staged_vars.') + + if (self.params.debugger is not None and self.params.debugger != 'cli' and + ':' not in self.params.debugger): + raise ValueError('--debugger must be "cli" or in the form ' + 'host:port') + + if self.params.hierarchical_copy and self.params.num_gpus <= 1: + raise ValueError('--hierarchical_copy requires --num_gpus to be greater ' + 'than 1') + + if params.save_model_secs and params.save_model_steps: + raise ValueError('At most one of --save_model_secs and ' + '--save_model_steps can be specified') + + eval_during_training_flags = list(map(bool, [ + params.eval_during_training_every_n_steps, + params.eval_during_training_every_n_epochs, + params.eval_during_training_at_specified_steps, + params.eval_during_training_at_specified_epochs, + ])) + + if eval_during_training_flags.count(True) > 1: + raise ValueError('At most one flag with --eval_during_training_* prefix ' + 'must be specified.') + + eval_during_training_enabled = any(eval_during_training_flags) + + if eval_during_training_enabled: + if params.eval: + raise ValueError('At most one of --eval and --eval_during_training_* ' + 'must be specified') + if params.forward_only: + raise ValueError('At most one of --forward_only and ' + '--eval_during_training_* must be specified') + if params.job_name: + raise ValueError('--eval_during_training_* is not yet supported in ' + 'distributed mode.') + if params.staged_vars: + raise ValueError('--eval_during_training_* is not currently compatible ' + 'with --staged_vars') + + if params.stop_at_top_1_accuracy and not eval_during_training_enabled: + raise ValueError('--stop_at_top_1_accuracy is only supported with ' + '--eval_during_training_*') + if params.collect_eval_results_async and params.model != 'ssd300': + raise ValueError('--collect_eval_results_async only works with ssd300 ' + 'model currently.') + if self.params.forward_only and self.params.freeze_when_forward_only: + if self.params.train_dir is not None: + raise ValueError('In forward_only mode, when --freeze_when_forward_only' + ' is True, --train_dir should not be specified') + if self.params.data_dir and not self.params.datasets_use_prefetch: + raise ValueError('In forward_only mode, when --freeze_when_forward_only' + ' is True and --data_dir is set, ' + '--datasets_use_prefetch should be set to True') + if self.params.job_name: + raise ValueError('In forward_only mode, when --freeze_when_forward_only' + ' is True, --job_name should not be specified and ' + 'distributed running is not supported') + self.forward_only_and_freeze = True + else: + self.forward_only_and_freeze = False + if self.params.trt_mode: + raise 
ValueError('--trt_mode should not be specified if one of ' + '--forward_only and --freeze_when_forward_only is set ' + 'to False') + + self.mode = get_mode_from_params(self.params) + + # Use the batch size from the command line if specified, otherwise use the + # model's default batch size. Scale the benchmark's batch size by the + # number of GPUs. + if self.params.batch_size > 0: + self.model.set_batch_size(self.params.batch_size) + self.batch_size = self.model.get_batch_size() * self.num_gpus + if self.mode in (constants.BenchmarkMode.TRAIN, + constants.BenchmarkMode.TRAIN_AND_EVAL): + self.train_batch_size = self.batch_size + else: + self.train_batch_size = None + if self.mode in (constants.BenchmarkMode.EVAL, + constants.BenchmarkMode.TRAIN_AND_EVAL): + if self.params.eval_batch_size > 0: + self.eval_batch_size = self.params.eval_batch_size * self.num_gpus + else: + self.eval_batch_size = self.batch_size + else: + self.eval_batch_size = None + self.batch_group_size = self.params.batch_group_size + self.enable_auto_loss_scale = ( + self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale) + self.loss_scale = None + self.loss_scale_normal_steps = None + + self.job_name = self.params.job_name # "" for local training + + # PS server is used for distributed jobs not using all-reduce. + use_ps_server = self.job_name and (self.params.variable_update != + 'distributed_all_reduce' and + self.params.variable_update != + 'collective_all_reduce') + # controller is used for distributed_all_reduce with > 1 worker. + use_controller = ( + self.params.variable_update == 'distributed_all_reduce' and + self.job_name) + if use_controller and not params.controller_host: + raise ValueError('When variable_update==distributed_all_reduce ' + 'controller_host must also be specified.') + # collective_all_reduce doesn't need a controller or ps + self.distributed_collective = ( + self.params.variable_update == 'collective_all_reduce' and + self.job_name) + + self.local_parameter_device_flag = self.params.local_parameter_device + if self.job_name: + self.task_index = self.params.task_index + self.cluster_manager = platforms_util.get_cluster_manager( + params, create_config_proto(params)) + assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager) + + worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index + if use_ps_server: + self.param_server_device = tf.train.replica_device_setter( + worker_device=worker_prefix + '/cpu:0', + cluster=self.cluster_manager.get_cluster_spec()) + # This device on which the queues for managing synchronization between + # servers should be stored. 
+ self.sync_queue_devices = [ + '/job:ps/replica:0/task:%s/cpu:0' % i + for i in range(self.cluster_manager.num_ps()) + ] + else: + self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0'] + else: + self.task_index = 0 + self.cluster_manager = None + worker_prefix = '' + self.param_server_device = '/%s:0' % self.params.local_parameter_device + self.sync_queue_devices = [self.param_server_device] + + if self.cluster_manager: + self.num_workers = self.cluster_manager.num_workers() + elif self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + self.num_workers = hvd.size() + else: + self.num_workers = 1 + self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0 + + if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl': + raise ValueError('--all_reduce_spec=nccl is invalid in a ' + 'multi-worker job') + + # Device to use for ops that need to always run on the local worker's CPU. + self.cpu_device = '%s/cpu:0' % worker_prefix + + # Device to use for ops that need to always run on the local worker's + # compute device, and never on a parameter server device. + self.raw_devices = [ + '%s/%s:%i' % (worker_prefix, self.params.device, i) + for i in xrange(self.num_gpus) + ] + + subset = 'validation' if params.eval else 'train' + self.num_batches, self.num_epochs = get_num_batches_and_epochs( + params, self.batch_size * self.num_workers, + self.dataset.num_examples_per_epoch(subset)) + if self.mode in (constants.BenchmarkMode.EVAL, + constants.BenchmarkMode.TRAIN_AND_EVAL): + # TODO(reedwm): Currently we do extra eval logic for num_eval_batches and + # the preprocessor. We should encapsulate this logic into a shared + # function or class. + if params.num_eval_batches is None and params.num_eval_epochs is None: + eval_params = self.params + else: + eval_params = self.params._replace( + num_batches=self.params.num_eval_batches, + num_epochs=self.params.num_eval_epochs) + self.num_eval_batches, self.num_eval_epochs = get_num_batches_and_epochs( + eval_params, self.eval_batch_size * self.num_workers, + self.dataset.num_examples_per_epoch('validation')) + else: + self.num_eval_batches, self.num_eval_epochs = None, None + + num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train') + if self.params.eval_during_training_every_n_epochs: + n_epochs = self.params.eval_during_training_every_n_epochs + self.eval_during_training_at_specified_steps = { + (int(e * num_train_examples_per_epoch + self.batch_size - 1) // + self.batch_size) + for e in np.arange(n_epochs, self.num_epochs, n_epochs)} + + if self.params.eval_during_training_at_specified_steps: + try: + self.eval_during_training_at_specified_steps = set(map( + int, self.params.eval_during_training_at_specified_steps)) + except ValueError: + raise ValueError('Param eval_during_training_at_specified_steps value ' + 'of %s cannot be converted to a list of integers.' 
% + (self.params.eval_during_training_at_specified_steps)) + + if self.params.eval_during_training_at_specified_epochs: + try: + n_epochs = list(map( + float, self.params.eval_during_training_at_specified_epochs)) + offset = n_epochs[0] - 1 + if offset.is_integer(): + offset = int(offset) + mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset) + self.eval_during_training_at_specified_steps = { + (int(e * num_train_examples_per_epoch + self.batch_size - 1) // + self.batch_size) + for e in n_epochs} + except ValueError: + raise ValueError('Param eval_during_training_at_specified_epochs value ' + 'of %s cannot be converted to a list of floats.' % + (self.params.eval_during_training_at_specified_epochs)) + + if params.eval_during_training_every_n_epochs: + offset = params.eval_during_training_every_n_epochs - 1 + if offset.is_integer(): + offset = int(offset) + mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset) + + if (self.params.staged_vars and + self.params.variable_update != 'parameter_server'): + raise ValueError('staged_vars for now is only supported with ' + 'variable_update=parameter_server') + + if self.params.variable_update == 'parameter_server': + if self.job_name: + if not self.params.staged_vars: + self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS( + self) + else: + self.variable_mgr = ( + variable_mgr.VariableMgrDistributedFetchFromStagedPS(self)) + else: + if not self.params.staged_vars: + self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self) + else: + self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS( + self) + elif self.params.variable_update == 'replicated': + if self.job_name: + raise ValueError('Invalid variable_update in distributed mode: %s' % + self.params.variable_update) + self.variable_mgr = variable_mgr.VariableMgrLocalReplicated( + self, self.params.all_reduce_spec, + self.params.agg_small_grads_max_bytes, + self.params.agg_small_grads_max_group, + self.params.allreduce_merge_scope) + elif self.params.variable_update == 'distributed_all_reduce': + assert self.params.cross_replica_sync + self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce( + self, self.params.all_reduce_spec, + ('worker' if self.num_workers > 1 else 'localhost'), + self.num_workers, self.params.agg_small_grads_max_bytes, + self.params.agg_small_grads_max_group, + self.params.allreduce_merge_scope) + elif self.params.variable_update == 'collective_all_reduce': + assert self.params.cross_replica_sync + self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce( + self, self.params.all_reduce_spec, + self.num_workers, self.num_gpus, self.task_index, + self.params.allreduce_merge_scope) + elif self.params.variable_update == 'distributed_replicated': + assert self.params.cross_replica_sync + if not self.job_name: + raise ValueError('Invalid variable_update in local mode: %s' % + self.params.variable_update) + self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self) + elif self.params.variable_update in ('independent', 'horovod'): + if self.job_name: + raise ValueError('Invalid variable_update in distributed mode: %s' % + self.params.variable_update) + self.variable_mgr = variable_mgr.VariableMgrIndependent(self) + else: + raise ValueError( + 'Invalid variable_update: %s' % self.params.variable_update) + + # Device to use for running on the local worker's compute device, but + # with variables assigned to parameter server devices. 
+ self.devices = self.variable_mgr.get_devices() + if self.job_name: + if use_ps_server: + self.global_step_device = self.param_server_device + elif self.params.variable_update == 'collective_all_reduce': + self.global_step_device = self.cpu_device + else: + self.global_step_device = '/job:worker/replica:0/task:0/cpu:0' + else: + self.global_step_device = self.cpu_device + + self.input_preprocessor = None + self.eval_input_preprocessor = None + if not self.dataset.use_synthetic_gpu_inputs(): + if not self.params.eval: + self.input_preprocessor = self.get_input_preprocessor() + if self.mode in (constants.BenchmarkMode.EVAL, + constants.BenchmarkMode.TRAIN_AND_EVAL): + with self._do_eval(): + self.eval_input_preprocessor = self.get_input_preprocessor() + self.datasets_use_prefetch = ( + self.params.datasets_use_prefetch and + # TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the + # CPU. + self.params.device.lower() != 'cpu' and + self.input_preprocessor and + self.input_preprocessor.supports_datasets()) + self.init_global_step = 0 + + self._config_benchmark_logger() + + if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL: + # Remove "eval" from params so it is not accidentally used. Since eval can + # still occur despite params.eval being False, params.eval should never + # be used. We cannot yet remove this unconditionally, because the SSD + # model still uses params.eval, and hence does not work properly with + # --eval_during_training_*. + # TODO(b/116627045): We should also remove fields that have an eval + # equivalent, like num_batches and num_eval_batches. + self.params = remove_param_fields(self.params, {'eval'}) + + @contextlib.contextmanager + def _do_eval(self): + """Context manager to switches BenchmarkCNN to eval mode. + + Any evaluation code should be put under this context manager. This context + manager switches self._doing_eval to True. It also switches certain + attributes, like self.num_batches and self.num_epochs, to be the number of + batches and epochs for evaluation respectively + + Yields: + Nothing. + """ + # TODO(b/116627045): Find a more general way of switching attributes to the + # eval equivalents. + old_doing_eval = self._doing_eval + old_num_batches = self.num_batches + old_num_epochs = self.num_epochs + old_batch_size = self.batch_size + try: + self._doing_eval = True + self.num_batches = self.num_eval_batches + self.num_epochs = self.num_eval_epochs + self.batch_size = self.eval_batch_size + self.model.set_batch_size(self.eval_batch_size // self.num_gpus) + yield + finally: + self._doing_eval = old_doing_eval + self.num_batches = old_num_batches + self.num_epochs = old_num_epochs + self.batch_size = old_batch_size + self.model.set_batch_size(old_batch_size // self.num_gpus) + + def _config_benchmark_logger(self): + """Config the model garden benchmark logger.""" + model_benchmark_logger = None + if self.params.benchmark_log_dir is not None: + try: + from official.utils.logs import logger as models_logger # pylint: disable=g-import-not-at-top + except ImportError: + tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH ' + 'in order to use BenchmarkLogger. Configured ' + 'benchmark_log_dir: %s' + % self.params.benchmark_log_dir) + raise + model_benchmark_logger = models_logger.BenchmarkFileLogger( + self.params.benchmark_log_dir) + self.benchmark_logger = model_benchmark_logger + + # TODO(laigd): this changes the global device list which is used everywhere, + # consider refactoring it. 
+ def reset_devices_for_task(self, task_num, is_local=False): + """Used to imitate another task when building a distributed graph.""" + worker_prefix = ('/job:localhost' if is_local else + '/job:worker/replica:0/task:%s' % task_num) + self.cpu_device = '%s/cpu:0' % worker_prefix + self.raw_devices = [ + '%s/%s:%i' % (worker_prefix, self.params.device, i) + for i in xrange(self.num_gpus) + ] + self.devices = self.variable_mgr.get_devices() + + def raw_devices_across_tasks(self, is_local=False): + """Returns list of raw device names across all tasks.""" + if is_local: + assert self.num_workers == 1 + return self.raw_devices + else: + return [ + 'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i) + for t in xrange(self.num_workers) + for i in xrange(self.num_gpus) + ] + + def print_info(self): + """Print basic information.""" + benchmark_info = self._get_params_info() + log_fn('Model: %s' % self.model.get_model_name()) + log_fn('Dataset: %s' % benchmark_info['dataset_name']) + log_fn('Mode: %s' % self.mode) + log_fn('SingleSess: %s' % benchmark_info['single_session']) + log_fn('Batch size: %s global' % (self.batch_size * self.num_workers)) + log_fn(' %s per device' % (self.batch_size // + len(self.raw_devices))) + if self.batch_group_size > 1: + log_fn(' %d batches per prepocessing group' % + self.batch_group_size) + log_fn('Num batches: %d' % self.num_batches) + log_fn('Num epochs: %.2f' % self.num_epochs) + log_fn('Devices: %s' % benchmark_info['device_list']) + log_fn('NUMA bind: %s' % self.params.use_numa_affinity) + log_fn('Data format: %s' % self.params.data_format) + if self.rewriter_config: + log_fn('RewriterConfig: %s' % self.rewriter_config) + log_fn('Optimizer: %s' % self.params.optimizer) + log_fn('Variables: %s' % self.params.variable_update) + if (self.params.variable_update == 'replicated' or + self.params.variable_update == 'distributed_all_reduce' + or self.params.variable_update == 'collective_all_reduce'): + log_fn('AllReduce: %s' % self.params.all_reduce_spec) + if self.job_name: + log_fn('Sync: %s' % self.params.cross_replica_sync) + if self.params.staged_vars: + log_fn('Staged vars: %s' % self.params.staged_vars) + if self.params.variable_update == 'horovod' and self.params.horovod_device: + log_fn('Horovod on: %s' % self.params.horovod_device) + log_fn('==========') + + def _get_params_info(self): + """Get the common parameters info for the benchmark run. + + Returns: + A dict of processed parameters. + """ + dataset_name = self.dataset.name + if self.dataset.use_synthetic_gpu_inputs(): + dataset_name += ' (synthetic)' + single_session = self.params.variable_update == 'distributed_all_reduce' + if single_session: + device_list = self.raw_devices_across_tasks() + elif self.params.variable_update == 'horovod': + device_list = ['horovod/%s:%d' % (self.params.device, idx) + for idx in range(self.num_workers)] + else: + device_list = self.raw_devices + return { + 'dataset_name': dataset_name, + 'single_session': single_session, + 'device_list': device_list,} + + def _log_benchmark_run(self): + """Log the benchmark info to the logger. + + The info logged here should be similar to print_info(), but in a structured + JSON format. 
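+
+    This is a no-op unless --benchmark_log_dir was set, since the benchmark
+    logger is only created by _config_benchmark_logger in that case.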
+ """ + if self.benchmark_logger: + benchmark_info = self._get_params_info() + + run_param = { + 'model': self.model.get_model_name(), + 'dataset': benchmark_info['dataset_name'], + 'mode': self.mode, + 'single_sess': benchmark_info['single_session'], + 'devices': benchmark_info['device_list'], + 'batch_size': self.batch_size, + 'batch_size_per_device': self.batch_size // len(self.raw_devices), + 'num_batches': self.num_batches, + 'num_epochs': self.num_epochs, + 'data_format': self.params.data_format, + 'rewrite_config': self.rewriter_config, + 'optimizer': self.params.optimizer, + 'session_config': create_config_proto(self.params), + } + # TODO(scottzhu): tf_cnn_benchmark might execute several times with + # different param setting on the same box. This will cause the run file to + # only contain the latest info. The benchmark_log_dir should be updated + # for every new run. + self.benchmark_logger.log_run_info( + self.model.get_model_name(), benchmark_info['dataset_name'], + run_param, test_id=self.params.benchmark_test_id) + + def run(self): + """Run the benchmark task assigned to this process. + + Returns: + Dictionary of statistics for training or eval. + Raises: + ValueError: unrecognized job name. + """ + if self.params.job_name == 'ps': + log_fn('Running parameter server %s' % self.task_index) + self.cluster_manager.join_server() + return {} + + # For distributed_all_reduce with multiple workers, drive + # from a separate controller process. + if self.params.variable_update == 'distributed_all_reduce': + if self.params.job_name == 'worker': + log_fn('Starting worker %s' % self.task_index) + self.cluster_manager.join_server() + return + elif self.params.job_name and self.params.job_name != 'controller': + raise ValueError('unrecognized job name: %s' % self.params.job_name) + + self._log_benchmark_run() + if self._doing_eval: + with tf.Graph().as_default(): + # TODO(laigd): freeze the graph in eval mode. + return self._run_eval() + else: + return self._benchmark_train() + + def _run_eval(self): + """Evaluate a model every self.params.eval_interval_secs. + + Returns: + Dictionary containing eval statistics. Currently returns an empty + dictionary. + + Raises: + ValueError: If self.params.train_dir is unspecified. + """ + if self.params.train_dir is None: + raise ValueError('Trained model directory not specified') + graph_info = self._build_eval_graph() + saver = tf.train.Saver(self.variable_mgr.savable_variables()) + summary_writer = tf.summary.FileWriter(self.params.eval_dir, + tf.get_default_graph()) + target = '' + # TODO(huangyp): Check if checkpoints haven't updated for hours and abort. + while True: + with tf.Session( + target=target, config=create_config_proto(self.params)) as sess: + image_producer = None + try: + global_step = load_checkpoint(saver, sess, self.params.train_dir) + image_producer = self._initialize_eval_graph( + graph_info.enqueue_ops, graph_info.input_producer_op, + graph_info.local_var_init_op_group, sess) + except CheckpointNotFoundException: + log_fn('Checkpoint not found in %s' % self.params.train_dir) + else: # Only executes if an exception was not thrown + self._eval_once(sess, summary_writer, graph_info.fetches, + graph_info.summary_op, image_producer, global_step) + if image_producer is not None: + image_producer.done() + if self.params.eval_interval_secs <= 0: + break + time.sleep(self.params.eval_interval_secs) + return {} + + def _build_eval_graph(self, scope_name=None): + """Build the evaluation graph. 
+ + Args: + scope_name: String to filter what summaries are collected. Only summary + ops whose name contains `scope_name` will be added, which is useful for + only including evaluation ops. + + Returns: + A GraphInfo named_tuple containing various useful ops and tensors of the + evaluation grpah. + """ + with self._do_eval(): + input_producer_op, enqueue_ops, fetches = self._build_model() + local_var_init_op = tf.local_variables_initializer() + table_init_ops = tf.tables_initializer() + variable_mgr_init_ops = [local_var_init_op] + if table_init_ops: + variable_mgr_init_ops.extend([table_init_ops]) + with tf.control_dependencies([local_var_init_op]): + variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops()) + local_var_init_op_group = tf.group(*variable_mgr_init_ops) + + summary_op = tf.summary.merge_all(scope=scope_name) + # The eval graph has no execution barrier because it doesn't run in + # distributed mode. + execution_barrier = None + # We do not use the global step during evaluation. + global_step = None + return GraphInfo(input_producer_op, enqueue_ops, fetches, + execution_barrier, global_step, local_var_init_op_group, + summary_op) + + # TODO(reedwm): For consistency, we should have a similar + # "_initialize_train_graph" function. They can likely be the same function. + def _initialize_eval_graph(self, enqueue_ops, input_producer_op, + local_var_init_op_group, sess): + """Initializes the evaluation graph. + + Args: + enqueue_ops: Ops that adds the preprocessed images to the staging areas. + input_producer_op: Op that produce the input batches (before + preprocessing). + local_var_init_op_group: Group of ops that perform per-device + initialization work. + sess: The session to initialize the eval graph with. + + Returns: + An ImageProducer, or None if an ImageProducer isn't being used. + """ + with self._do_eval(): + if local_var_init_op_group is not None: + # We might reinitialize local variables if they were already initialized + # during training. This is OK. + sess.run(local_var_init_op_group) + if self.dataset.queue_runner_required(): + tf.train.start_queue_runners(sess=sess) + image_producer = None + if input_producer_op is not None: + image_producer = cnn_util.ImageProducer( + sess, input_producer_op, self.batch_group_size, + self.params.use_python32_barrier) + image_producer.start() + if enqueue_ops: + for i in xrange(len(enqueue_ops)): + sess.run(enqueue_ops[:(i + 1)]) + if image_producer is not None: + image_producer.notify_image_consumption() + return image_producer + + def _eval_once(self, sess, summary_writer, fetches, summary_op, + image_producer, global_step): + """Evaluate the model using the validation dataset.""" + with self._do_eval(): + mlperf.logger.log_eval_epoch( + mlperf.tags.EVAL_START, global_step, self.batch_size) + loop_start_time = start_time = time.time() + # TODO(laigd): refactor the part to compute/report the accuracy. Currently + # it only works for image models. + top_1_accuracy_sum = 0.0 + top_5_accuracy_sum = 0.0 + total_eval_count = self.num_batches * self.batch_size + pred_classes = [] + for step in xrange(self.num_batches): + if (summary_writer and self.params.save_summaries_steps > 0 and + (step + 1) % self.params.save_summaries_steps == 0): + results, summary_str = sess.run([fetches, summary_op]) + summary_writer.add_summary(summary_str) + else: + results = sess.run(fetches) + # Make global_step available in results for postprocessing. 
+ results['global_step'] = global_step + results = self.model.postprocess(results) + pred_classes.append(results['all_logits']) + top_1_accuracy_sum += results['top_1_accuracy'] + top_5_accuracy_sum += results['top_5_accuracy'] + if (step + 1) % self.params.display_every == 0: + duration = time.time() - start_time + examples_per_sec = ( + self.batch_size * self.params.display_every / duration) + log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec)) + start_time = time.time() + if image_producer is not None: + image_producer.notify_image_consumption() + pred_classes = np.squeeze(np.array(pred_classes)) + save_postfix = 'nv' if 'nv' in self.params.save_dir else 'bi' + np.save('{}/pred_classes_{}_{}.npy'.format(self.params.save_dir, self.params.model, save_postfix), pred_classes) + loop_end_time = time.time() + accuracy_at_1 = top_1_accuracy_sum / self.num_batches + accuracy_at_5 = top_5_accuracy_sum / self.num_batches + summary = tf.Summary() + summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1) + summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5) + for result_key, result_value in results.items(): + if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX): + prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX) + summary.value.add(tag='eval/' + result_key[prefix_len:], + simple_value=result_value) + if summary_writer: + summary_writer.add_summary(summary, global_step) + log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' % + (accuracy_at_1, accuracy_at_5, total_eval_count)) + elapsed_time = loop_end_time - loop_start_time + images_per_sec = (self.num_batches * self.batch_size / elapsed_time) + if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL: + # Note that we compute the top 1 accuracy and top 5 accuracy for each + # batch, which will have a slight performance impact. + log_fn('-' * 64) + log_fn('total images/sec: %.2f' % images_per_sec) + log_fn('-' * 64) + if self.benchmark_logger: + eval_result = { + 'eval_top_1_accuracy', accuracy_at_1, + 'eval_top_5_accuracy', accuracy_at_5, + 'eval_average_examples_per_sec', images_per_sec, + tf.GraphKeys.GLOBAL_STEP, global_step, + } + self.benchmark_logger.log_evaluation_result(eval_result) + mlperf.logger.log_eval_epoch( + mlperf.tags.EVAL_STOP, global_step, self.batch_size) + mlperf.logger.log(key=mlperf.tags.EVAL_SIZE, + value=self.num_batches * self.batch_size) + if self.params.model != 'ssd300': # ssd300 logs eval accuracy elsewhere. + mlperf.logger.log_eval_accuracy( + accuracy_at_1, global_step, self.train_batch_size, + examples_per_epoch=self.dataset.num_examples_per_epoch('train')) + if self.params.stop_at_top_1_accuracy: + mlperf.logger.log(key=mlperf.tags.EVAL_TARGET, + value=self.params.stop_at_top_1_accuracy) + return accuracy_at_1, accuracy_at_5 + + def _benchmark_train(self): + """Run cnn in benchmark mode. Skip the backward pass if forward_only is on. + + Returns: + Dictionary containing training statistics (num_workers, num_steps, + average_wall_time, images_per_sec). 
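+
+    Raises:
+      RuntimeError: If the input pipeline raises tf.errors.OutOfRangeError,
+        which is re-raised as a RuntimeError so that the Supervisor's
+        managed_session does not silently swallow it (see _benchmark_graph).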
+ """ + graph = tf.Graph() + with graph.as_default(): + build_result = self._build_graph() + if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL: + with self.variable_mgr.reuse_variables(): + with tf.name_scope('Evaluation') as ns: + eval_build_results = self._build_eval_graph(ns) + else: + eval_build_results = None + (graph, result_to_benchmark) = self._preprocess_graph(graph, build_result) + with graph.as_default(): + return self._benchmark_graph(result_to_benchmark, eval_build_results) + + GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs' + + def _unfreezable_local_variables(self, graph): + """Get the local variables that we don't want to freeze.""" + return graph.get_collection( + tf.GraphKeys.LOCAL_VARIABLES, + # We don't freeze the gpu_cached_images local variable so it won't get + # constant folded with ops which process the input. + scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME) + + def _build_graph(self): + """Build the graph. + + Returns: + A namedtuple containing the ops/tensors that required by + _benchmark_graph(). + """ + if self.params.variable_update == 'distributed_all_reduce': + self.single_session = True + (input_producer_op, enqueue_ops, fetches) = ( + self._build_model_single_session()) + else: + self.single_session = False + (input_producer_op, enqueue_ops, fetches) = self._build_model() + fetches_list = nest.flatten(list(fetches.values())) + main_fetch_group = tf.group(*fetches_list, name='main_fetch_group') + execution_barrier = None + if (not self.single_session and self.job_name and + not self.params.cross_replica_sync): + execution_barrier = self.add_sync_queues_and_barrier( + 'execution_barrier_', []) + + global_step = tf.train.get_global_step() + with tf.device(self.global_step_device), tf.name_scope('inc_global_step'): + with tf.control_dependencies([main_fetch_group]): + fetches['inc_global_step'] = global_step.assign_add(1) + + if ((not self.single_session) and (not self.distributed_collective) and + self.job_name and self.params.cross_replica_sync): + # Block all replicas until all replicas are ready for next step. + fetches['sync_queues'] = self.add_sync_queues_and_barrier( + 'sync_queues_step_end_', [main_fetch_group]) + + # Skips the init ops for freezable local variables in forward_only mode so + # we can remove all the assign ops when converting variables to constants. + with tf.name_scope('local_variable_initialization'): + if self.forward_only_and_freeze: + local_var_init_op = tf.variables_initializer( + self._unfreezable_local_variables(tf.get_default_graph())) + else: + local_var_init_op = tf.local_variables_initializer() + table_init_ops = tf.tables_initializer() + + variable_manager_init_ops = [local_var_init_op] + if table_init_ops: + variable_manager_init_ops.extend([table_init_ops]) + if not self.forward_only_and_freeze: + with tf.control_dependencies([local_var_init_op]): + variable_manager_init_ops.extend(self.variable_mgr.get_post_init_ops()) + if ((not self.single_session) and (not self.distributed_collective) and + self.job_name and self.params.cross_replica_sync): + # Ensure all workers execute variable_manager_init_ops before they start + # executing the model. 
+ variable_manager_init_ops.append( + self.add_sync_queues_and_barrier('init_ops_end_', + variable_manager_init_ops)) + local_var_init_op_group = tf.group(*variable_manager_init_ops, + name='local_var_init_op_group') + summary_op = tf.summary.merge_all() + + return GraphInfo( + input_producer_op=input_producer_op, + enqueue_ops=enqueue_ops, + fetches=fetches, + execution_barrier=execution_barrier, + global_step=global_step, + local_var_init_op_group=local_var_init_op_group, + summary_op=summary_op) + + def _benchmark_graph(self, graph_info, eval_graph_info): + """Benchmark the training graph. + + Args: + graph_info: the namedtuple returned by _build_graph() which + contains all necessary information to benchmark the graph, including + named tensors/ops list, fetches, etc. + eval_graph_info: Similar to graph_info but for the eval graph if + --eval_during_training_* is used. Otherwise, None. + Returns: + Dictionary containing training statistics (num_workers, num_steps, + average_wall_time, images_per_sec). + """ + log_fn('Initializing graph') + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + # First worker will be 'chief' - it will write summaries and + # save checkpoints. + is_chief = hvd.rank() == 0 + else: + is_chief = (not self.job_name or self.task_index == 0) + + summary_writer = None + if (is_chief and self.params.summary_verbosity and self.params.train_dir and + self.params.save_summaries_steps > 0): + summary_writer = tf.summary.FileWriter(self.params.train_dir, + tf.get_default_graph()) + + # We want to start the benchmark timer right after a image_producer barrier + # and avoids undesired waiting times on barriers. + if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) % + self.batch_group_size) != 0: + self.num_warmup_batches = int( + math.ceil( + (self.num_warmup_batches + len(graph_info.enqueue_ops) - 1.0) / + (self.batch_group_size)) * self.batch_group_size - + len(graph_info.enqueue_ops) + 1) + log_fn('Round up warm up steps to %d to match batch_group_size' % + self.num_warmup_batches) + assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) % + self.batch_group_size) == 0 + # We run the summaries in the same thread as the training operations by + # passing in None for summary_op to avoid a summary_thread being started. + # Running summaries and training operations in parallel could run out of + # GPU memory. + if is_chief and not self.forward_only_and_freeze: + saver = tf.train.Saver( + self.variable_mgr.savable_variables(), + save_relative_paths=True, + max_to_keep=self.params.max_ckpts_to_keep) + else: + saver = None + ready_for_local_init_op = None + if self.job_name and not (self.single_session or + self.distributed_collective): + # In distributed mode, we don't want to run local_var_init_op_group until + # the global variables are initialized, because local_var_init_op_group + # may use global variables (such as in distributed replicated mode). We + # don't set this in non-distributed mode, because in non-distributed mode, + # local_var_init_op_group may itself initialize global variables (such as + # in replicated mode). 
+ ready_for_local_init_op = tf.report_uninitialized_variables( + tf.global_variables()) + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + bcast_global_variables_op = hvd.broadcast_global_variables(0) + else: + bcast_global_variables_op = None + + if self.params.variable_update == 'collective_all_reduce': + # It doesn't matter what this collective_graph_key value is, + # so long as it's > 0 and the same at every worker. + init_run_options = tf.RunOptions() + init_run_options.experimental.collective_graph_key = 6 + else: + init_run_options = tf.RunOptions() + local_var_init_ops = [graph_info.local_var_init_op_group] + if eval_graph_info: + # `eval_graph_info.local_var_init_op_group` also includes some of the + # training initializer ops, since it's difficult to filter them out. + # Rerunning the training initializer ops is OK, but we add a control + # dependency since running two sets of training initializer ops at the + # same time can cause race conditions. + with tf.control_dependencies(local_var_init_ops): + local_var_init_ops.append(eval_graph_info.local_var_init_op_group) + sv = tf.train.Supervisor( + # For the purpose of Supervisor, all Horovod workers are 'chiefs', + # since we want session to be initialized symmetrically on all the + # workers. + is_chief=is_chief or (self.params.variable_update == 'horovod' + or self.distributed_collective), + # Log dir should be unset on non-chief workers to prevent Horovod + # workers from corrupting each other's checkpoints. + logdir=self.params.train_dir if is_chief else None, + ready_for_local_init_op=ready_for_local_init_op, + local_init_op=local_var_init_ops, + saver=saver, + global_step=graph_info.global_step, + summary_op=None, + save_model_secs=self.params.save_model_secs, + summary_writer=summary_writer, + local_init_run_options=init_run_options) + + profiler = tf.profiler.Profiler() if self.params.tfprof_file else None + if self.graph_file is not None: + path, filename = os.path.split(self.graph_file) + as_text = filename.endswith('txt') + log_fn('Writing GraphDef as %s to %s' % ( # pyformat break + 'text' if as_text else 'binary', self.graph_file)) + tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True), + path, filename, as_text) + + start_standard_services = ( + self.params.train_dir or + self.dataset.queue_runner_required()) + target = self.cluster_manager.get_target() if self.cluster_manager else '' + with sv.managed_session( + master=target, + config=create_config_proto(self.params), + start_standard_services=start_standard_services) as sess: + # Anything that can potentially raise an OutOfRangeError with 'sess' MUST + # be under this try block. The managed_session() context manager silently + # ignores OutOfRangeError, so we must catch them and wrap them with + # a different exception type so that they can be propagated up to the + # caller. + try: + stats = self.benchmark_with_session( + sess, sv, graph_info, eval_graph_info, bcast_global_variables_op, + is_chief, summary_writer, profiler) + except tf.errors.OutOfRangeError: + raise RuntimeError( + 'Received OutOfRangeError. Wrapping in Runtime error to avoid ' + 'Supervisor from suppressing the error. 
Original OutOfRangeError ' + 'with traceback:\n' + traceback.format_exc()) + + sv.stop() + if profiler: + generate_tfprof_profile(profiler, self.params.tfprof_file) + return stats + + def benchmark_with_session(self, sess, supervisor, graph_info, + eval_graph_info, bcast_global_variables_op, + is_chief, summary_writer, profiler): + """Benchmarks the graph with the given session. + + Args: + sess: The session to benchmark the graph with + supervisor: The Supervisor that created the session. + graph_info: the namedtuple returned by _build_graph() which + contains all necessary information to benchmark the graph, including + named tensors/ops list, fetches, etc. + eval_graph_info: Similar to graph_info but for the eval graph if + --eval_during_training_every_n_steps is used. Otherwise, None. + bcast_global_variables_op: If Horovod is used, the op to broadcast the + global variables to all the processes. None if Horovod is not used. + is_chief: True if this is the chief process. + summary_writer: The SummaryWriter used to write summaries, or None if + summaries are not used. + profiler: The tf.profiler.Profiler, or None if tfprof is not used. + + Returns: + Dictionary containing training statistics (num_workers, num_steps, + average_wall_time, images_per_sec). + """ + if self.params.backbone_model_path is not None: + self.model.load_backbone_model(sess, self.params.backbone_model_path) + if bcast_global_variables_op: + sess.run(bcast_global_variables_op) + image_producer = None + if graph_info.input_producer_op is not None: + image_producer = cnn_util.ImageProducer( + sess, graph_info.input_producer_op, self.batch_group_size, + self.params.use_python32_barrier) + image_producer.start() + if graph_info.enqueue_ops: + for i in xrange(len(graph_info.enqueue_ops)): + sess.run(graph_info.enqueue_ops[:(i + 1)]) + if image_producer is not None: + image_producer.notify_image_consumption() + self.init_global_step, = sess.run([graph_info.global_step]) + if self.job_name and not self.params.cross_replica_sync: + # TODO(zhengxq): Do we need to use a global step watcher at all? + global_step_watcher = GlobalStepWatcher( + sess, graph_info.global_step, + self.num_workers * self.num_warmup_batches + + self.init_global_step, + self.num_workers * (self.num_warmup_batches + self.num_batches) - 1) + global_step_watcher.start() + else: + global_step_watcher = None + eval_image_producer = None + if eval_graph_info: + # We pass local_var_init_op_group=None because the Supervisor already + # initialized local variables above. We need to have the Supervisor + # initialize the local variables, because otherwise it throws an error + # complaining that not all variables were initialized. + eval_image_producer = self._initialize_eval_graph( + eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op, + local_var_init_op_group=None, sess=sess) + step_train_times = [] + log_fn('Running warm up') + local_step = -1 * self.num_warmup_batches + if self.single_session: + # In single session mode, each step, the global_step is incremented by + # 1. In non-single session mode, each step, the global_step is + # incremented once per worker. This means we need to divide + # init_global_step by num_workers only in non-single session mode. 
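+      # Illustrative arithmetic (hypothetical numbers): resuming from a
+      # checkpoint with init_global_step=100, num_batches=110 and 2 workers,
+      # each worker has already run 100 // 2 = 50 local steps, so
+      # end_local_step = 110 - 50 = 60. In single-session mode the global step
+      # already counts local steps, so end_local_step would be 110 - 100 = 10.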
+ end_local_step = self.num_batches - self.init_global_step + else: + end_local_step = self.num_batches - (self.init_global_step // + self.num_workers) + if not global_step_watcher: + # In cross-replica sync mode, all workers must run the same number of + # local steps, or else the workers running the extra step will block. + done_fn = lambda: local_step >= end_local_step + else: + done_fn = global_step_watcher.done + if self.params.debugger is not None: + if self.params.debugger == 'cli': + log_fn('The CLI TensorFlow debugger will be used.') + sess = tf_debug.LocalCLIDebugWrapperSession(sess) + else: + log_fn('The TensorBoard debugger plugin will be used.') + sess = tf_debug.TensorBoardDebugWrapperSession(sess, + self.params.debugger) + mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP) + skip_final_eval = False + accuracy_at_1 = None + accuracy_at_5 = None + last_eval_step = local_step + loop_start_time = time.time() + last_average_loss = None + while not done_fn(): + if local_step == 0: + log_fn('Done warm up') + if graph_info.execution_barrier: + log_fn('Waiting for other replicas to finish warm up') + sess.run([graph_info.execution_barrier]) + + # TODO(laigd): rename 'Img' to maybe 'Input'. + header_str = ('Step\tImg/sec\t' + + self.params.loss_type_to_report.replace('/', ' ')) + if self.params.print_training_accuracy or self.params.forward_only: + # TODO(laigd): use the actual accuracy op names of the model. + header_str += '\ttop_1_accuracy\ttop_5_accuracy' + log_fn(header_str) + assert len(step_train_times) == self.num_warmup_batches + # reset times to ignore warm up batch + step_train_times = [] + loop_start_time = time.time() + if (summary_writer and + (local_step + 1) % self.params.save_summaries_steps == 0): + fetch_summary = graph_info.summary_op + else: + fetch_summary = None + collective_graph_key = 7 if ( + self.params.variable_update == 'collective_all_reduce') else 0 + (summary_str, last_average_loss) = benchmark_one_step( + sess, graph_info.fetches, local_step, + self.batch_size * (self.num_workers + if self.single_session else 1), step_train_times, + self.trace_filename, self.params.partitioned_graph_file_prefix, + profiler, image_producer, self.params, fetch_summary, + benchmark_logger=self.benchmark_logger, + collective_graph_key=collective_graph_key) + if summary_str is not None and is_chief: + supervisor.summary_computed(sess, summary_str) + local_step += 1 + if (self.params.save_model_steps and + local_step % self.params.save_model_steps == 0 and + local_step > 0 and + is_chief): + supervisor.saver.save(sess, supervisor.save_path, + supervisor.global_step) + if (eval_graph_info and local_step > 0 and not done_fn() and + self._should_eval_during_training(local_step)): + python_global_step = sess.run(graph_info.global_step) + num_steps_since_last_eval = local_step - last_eval_step + # The INPUT_SIZE tag value might not match the + # PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples + # run, which is INPUT_SIZE, is rounded up to the nearest multiple of + # self.batch_size. 
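+        # Illustrative example (single worker, evaluating once per epoch):
+        # with batch_size=256 and ImageNet's 1,281,167 training images, an
+        # epoch takes ceil(1281167 / 256) = 5005 steps, so INPUT_SIZE is
+        # logged as 5005 * 256 = 1,281,280 even though
+        # PREPROC_NUM_TRAIN_EXAMPLES is 1,281,167.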
+ mlperf.logger.log( + key=mlperf.tags.INPUT_SIZE, + value=num_steps_since_last_eval * self.batch_size) + log_fn('Running evaluation at global_step {}'.format( + python_global_step)) + accuracy_at_1, accuracy_at_5 = self._eval_once( + sess, summary_writer, eval_graph_info.fetches, + eval_graph_info.summary_op, eval_image_producer, + python_global_step) + last_eval_step = local_step + if (self.params.stop_at_top_1_accuracy and + accuracy_at_1 >= self.params.stop_at_top_1_accuracy): + log_fn('Stopping, as eval accuracy at least %s was reached' % + self.params.stop_at_top_1_accuracy) + skip_final_eval = True + break + else: + log_fn('Resuming training') + if eval_graph_info and self.model.reached_target(): + log_fn('Stopping, as the model indicates its custom goal was reached') + skip_final_eval = True + break + loop_end_time = time.time() + # Waits for the global step to be done, regardless of done_fn. + if global_step_watcher: + while not global_step_watcher.done(): + time.sleep(.25) + if not global_step_watcher: + elapsed_time = loop_end_time - loop_start_time + average_wall_time = elapsed_time / local_step if local_step > 0 else 0 + images_per_sec = (self.num_workers * local_step * self.batch_size / + elapsed_time) + num_steps = local_step * self.num_workers + else: + # NOTE: Each worker independently increases the global step. So, + # num_steps will be the sum of the local_steps from each worker. + num_steps = global_step_watcher.num_steps() + elapsed_time = global_step_watcher.elapsed_time() + average_wall_time = (elapsed_time * self.num_workers / num_steps + if num_steps > 0 else 0) + images_per_sec = num_steps * self.batch_size / elapsed_time + + # We skip printing images/sec if --eval_during_training_* is specified, + # because we are both processing training and evaluation images, so a + # singular "images/sec" value is meaningless. + if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL: + log_fn('-' * 64) + # TODO(laigd): rename 'images' to maybe 'inputs'. + log_fn('total images/sec: %.2f' % images_per_sec) + log_fn('-' * 64) + else: + log_fn('Done with training') + num_steps_since_last_eval = local_step - last_eval_step + mlperf.logger.log( + key=mlperf.tags.INPUT_SIZE, + value=num_steps_since_last_eval * self.batch_size) + python_global_step = sess.run(graph_info.global_step) + if eval_graph_info and not skip_final_eval: + log_fn('Running final evaluation at global_step {}'.format( + python_global_step)) + accuracy_at_1, accuracy_at_5 = self._eval_once( + sess, summary_writer, eval_graph_info.fetches, + eval_graph_info.summary_op, eval_image_producer, python_global_step) + num_epochs_ran = (python_global_step * self.batch_size / + self.dataset.num_examples_per_epoch('train')) + mlperf.logger.log_train_epochs(num_epochs_ran) + if image_producer is not None: + image_producer.done() + if eval_image_producer is not None: + eval_image_producer.done() + if is_chief: + if self.benchmark_logger: + self.benchmark_logger.log_metric( + 'average_examples_per_sec', images_per_sec, global_step=num_steps) + + # Save the model checkpoint. + if self.params.train_dir is not None and is_chief: + checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt') + if not gfile.Exists(self.params.train_dir): + gfile.MakeDirs(self.params.train_dir) + supervisor.saver.save(sess, checkpoint_path, graph_info.global_step) + if graph_info.execution_barrier: + # Wait for other workers to reach the end, so this worker doesn't + # go away underneath them. 
+ sess.run([graph_info.execution_barrier]) + stats = { + 'num_workers': self.num_workers, + 'num_steps': num_steps, + 'average_wall_time': average_wall_time, + 'images_per_sec': images_per_sec + } + if last_average_loss is not None: + stats['last_average_loss'] = last_average_loss + if accuracy_at_1 is not None: + stats['top_1_accuracy'] = accuracy_at_1 + if accuracy_at_5 is not None: + stats['top_5_accuracy'] = accuracy_at_5 + + success = bool(self.model.reached_target() or + (accuracy_at_1 and self.params.stop_at_top_1_accuracy and + accuracy_at_1 >= self.params.stop_at_top_1_accuracy)) + mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success}) + mlperf.logger.log(key=mlperf.tags.RUN_FINAL) + return stats + + def _should_eval_during_training(self, step): + """Return True iff should run eval during training at current step.""" + + assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL + + if self.params.eval_during_training_every_n_steps: + return step % self.params.eval_during_training_every_n_steps == 0 + + # All other --eval_during_training_* flags are converted to step numbers + # at which the model should run evaluation during training. + return step in self.eval_during_training_at_specified_steps + + def _preprocess_graph(self, graph, graph_info): + """Preprocess the graph before executing. + + Depending on the params, it runs various preprocessing on the graph, + including freezing, TensorRT conversion, etc. + + Args: + graph: the graph to preprocess. + graph_info: the namedtuple returned by _build_graph() which + contains all necessary information to benchmark the graph, including + named tensors/ops list, fetches, etc. + + Returns: + The updated graph and graph_info with the ops/tensors/fetches updated + according to the imported graph. + """ + assert isinstance(graph_info.fetches, dict) + assert isinstance(graph_info.global_step, tf.Variable) + if not self.forward_only_and_freeze: + return (graph, graph_info) + + # Get the names of the ops that need to keep during conversion. + flattened_op_names = list( + set([ + v.name.split(':')[0] + for v in nest.flatten(graph_info) + if v is not None + ])) + # Get variables that we don't want to freeze. + # Only keep unfreezable variables in forward_only_and_freeze mode. + # TODO(laigd): consider making global_step a constant. + variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES} + variables_to_keep.update({ + local_variable: tf.GraphKeys.LOCAL_VARIABLES + for local_variable in self._unfreezable_local_variables(graph) + }) + + variable_initializers = [ + variable.initializer.name for variable in variables_to_keep] + output_node_names = ( + flattened_op_names + + # Add variable initializer and read ops to the output list, so + # convert_variables_to_constants() will keep them. + variable_initializers + + [variable.value().op.name for variable in variables_to_keep]) + graphdef = graph.as_graph_def(add_shapes=True) + + # Freeze the graph. + with graph.as_default(): + with tf.Session(config=create_config_proto(self.params)) as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + graphdef = graph_util.convert_variables_to_constants( + sess, + graphdef, + output_node_names, + variable_names_blacklist=[ + variable.op.name for variable in variables_to_keep + ]) + + # Run TensorRT conversion. 
+ if self.params.trt_mode: + # Import here instead of at top, because this will crash if TensorRT is + # not installed + from tensorflow.python.compiler.tensorrt import trt_convert # pylint: disable=g-import-not-at-top + # Avoid TF-TRT bridge from touching all variable initializer ops and their + # dependencies, since they can directly be fetched by sess.run()s that + # initialize the variables. + # pylint: disable=protected-access + name_to_input_name, _, _ = graph_util_impl._extract_graph_summary( + graphdef) + initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes( + variable_initializers, name_to_input_name) + # pylint: enable=protected-access + + graphdef = trt_convert.create_inference_graph( + graphdef, + outputs=output_node_names + list(initializer_subgraph_ops), + max_batch_size=self.model.get_batch_size(), + max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes, + precision_mode=self.params.trt_mode) + + # Creates a new graph as the default and import the converted graph back. + updated_graph = tf.Graph() + + def _get_tensors_or_ops(inputs): + """Gets the updated tensors or ops from 'updated_graph'.""" + + def _get_fn(element): + if element is None: + return None + if ':' in element.name: + return updated_graph.get_tensor_by_name(element.name) + return updated_graph.get_operation_by_name(element.name) + + if isinstance(inputs, (list, dict, tuple)): + return nest.map_structure(_get_fn, inputs) + else: + return _get_fn(inputs) + + with updated_graph.as_default(): + importer.import_graph_def(graph_def=graphdef, name='') + + # Update the variables + for variable in variables_to_keep: + updated_variable = tf.Variable.from_proto(variable.to_proto()) + tf.add_to_collection(variables_to_keep[variable], updated_variable) + if variable is graph_info.global_step: + updated_global_step = updated_variable + + updated_graph_info = GraphInfo( + input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op), + enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops), + execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier), + local_var_init_op_group=_get_tensors_or_ops( + graph_info.local_var_init_op_group), + fetches=_get_tensors_or_ops(graph_info.fetches), + global_step=updated_global_step, + summary_op=None) + return (updated_graph, updated_graph_info) + + def _build_input_processing(self, shift_ratio=0): + """"Build the image (pre)processing portion of the model graph. + + Args: + shift_ratio: shift_ratio for data_flow_ops.RecordInput. + + Returns: + An InputProcessingInfo containing all the input sources to the model. + """ + input_processing_info = InputProcessingInfo( + input_producer_op=None, + input_producer_stages=None, + multi_device_iterator_input=None) + + mlperf.logger.log(key=mlperf.tags.INPUT_ORDER) + if not self._doing_eval: + mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE, value=self.batch_size) + + # If using synthetic gpu inputs, do nothing on the cpu side. + if self.dataset.use_synthetic_gpu_inputs(): + assert not self.datasets_use_prefetch + return input_processing_info + + if self._doing_eval: + input_preprocessor = self.eval_input_preprocessor + mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES, + value=self.dataset.num_examples_per_epoch('validation')) + else: + input_preprocessor = self.input_preprocessor + mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES, + value=self.dataset.num_examples_per_epoch('train')) + + # Use prefetching mechanism provided by dataset input pipeline. 
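+    # The MultiDeviceIterator built below prefetches batches with tf.data and
+    # its get_next() returns one batch per device (indexed later by
+    # rel_device_num), so the StagingArea path further down is only needed
+    # when tf.data prefetching is disabled.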
+ if self.datasets_use_prefetch: + multi_device_iterator = ( + input_preprocessor.build_multi_device_iterator( + self.batch_size, len(self.devices), self.cpu_device, self.params, + self.raw_devices, self.dataset, self._doing_eval)) + return input_processing_info._replace( + multi_device_iterator_input=multi_device_iterator.get_next()) + + # Not using dataset prefetching. Use a staging area to mimic the prefetching + # behavior instead. + with tf.device(self.cpu_device): + if self._doing_eval: + subset = 'validation' + else: + subset = 'train' + input_list = input_preprocessor.minibatch( + self.dataset, + subset=subset, + params=self.params, + shift_ratio=shift_ratio) + + input_producer_op = [] + input_producer_stages = [] + for device_num in range(len(self.devices)): + staging_area = data_flow_ops.StagingArea( + [parts[0].dtype for parts in input_list], + shapes=[parts[0].get_shape() for parts in input_list], + shared_name='input_producer_staging_area_%d_eval_%s' % + (device_num, self._doing_eval)) + input_producer_stages.append(staging_area) + for group_index in xrange(self.batch_group_size): + batch_index = group_index + device_num * self.batch_group_size + put_op = staging_area.put( + [parts[batch_index] for parts in input_list]) + input_producer_op.append(put_op) + assert input_producer_op + + return input_processing_info._replace( + input_producer_op=input_producer_op, + input_producer_stages=input_producer_stages) + + def _maybe_initialize_fp16(self): + """Initialize fp16 settings.""" + if self.params.use_fp16 and not self._doing_eval: + init_loss_scale_val = float(self.params.fp16_loss_scale or + self.model.get_fp16_loss_scale()) + self.loss_scale = None + self.loss_scale_normal_steps = None + if self.enable_auto_loss_scale or init_loss_scale_val != 1: + self.loss_scale = tf.get_variable( + name='loss_scale', + initializer=init_loss_scale_val, + dtype=tf.float32, + trainable=False) + if self.enable_auto_loss_scale: + self.loss_scale_normal_steps = tf.get_variable( + name='loss_scale_normal_steps', initializer=0, trainable=False) + + def _build_model(self): + """Build the TensorFlow graph.""" + if self.datasets_use_prefetch: + assert not self.params.staged_vars + assert not self.variable_mgr.supports_staged_vars() + + # Adjust seed so different workers start read different input files. + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + seed_adjustment = hvd.rank() + else: + seed_adjustment = 0 + mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED, + value=self.params.tf_random_seed + seed_adjustment) + tf.set_random_seed(self.params.tf_random_seed + seed_adjustment) + mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED, + value=4321 + seed_adjustment) + np.random.seed(4321 + seed_adjustment) + phase_train = not (self._doing_eval or self.params.forward_only) + + if self._doing_eval: + mode_string = 'evaluation' + else: + mode_string = 'training' + + log_fn('Generating {} model'.format(mode_string)) + losses = [] + device_grads = [] + all_logits = [] + all_accuracy_ops = {} + gpu_compute_stage_ops = [] + gpu_grad_stage_ops = [] + + with tf.device(self.global_step_device): + global_step = tf.train.get_or_create_global_step() + self._maybe_initialize_fp16() + + # Build the processing and model for the worker. 
+ input_producer_op = None + with tf.name_scope('input_processing'): + input_processing_info = self._build_input_processing(shift_ratio=0) + if input_processing_info.input_producer_op is not None: + input_producer_op = tf.group(*input_processing_info.input_producer_op) + update_ops = None + staging_delta_ops = [] + + for device_num in range(len(self.devices)): + with tf.name_scope('tower_%i' % device_num) as name_scope, ( + self.variable_mgr.create_outer_variable_scope(device_num)): + results = self.add_forward_pass_and_gradients( + phase_train, device_num, device_num, input_processing_info, + gpu_compute_stage_ops, gpu_grad_stage_ops) + + if self.params.backbone_model_path: + self.model.add_backbone_saver() + + if phase_train: + losses.append(results['loss']) + device_grads.append(results['gradvars']) + else: + all_logits.append(results['logits']) + if not phase_train or self.params.print_training_accuracy: + for name, op in results.items(): + if name.startswith('accuracy:'): + key = name[9:] + if key not in all_accuracy_ops: + all_accuracy_ops[key] = [] + all_accuracy_ops[key].append(op) + + if device_num == 0: + # Retain the Batch Normalization updates operations only from the + # first tower. These operations update the moving mean and moving + # variance variables, which are updated (but not used) during + # training, and used during evaluation. The moving mean and variance + # approximate the true mean and variance across all images in the + # dataset. Therefore, in replicated mode, these moving averages would + # be almost identical for each tower, and so we only update and save + # the moving averages for one tower. In parameter server mode, all + # towers share a copy of the variables so we also only need to update + # and save the moving averages once. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope) + if self.datasets_use_prefetch: + assert not self.variable_mgr.staging_delta_ops + else: + staging_delta_ops = list(self.variable_mgr.staging_delta_ops) + + enqueue_ops = [] + if not self.datasets_use_prefetch: + if self.variable_mgr.supports_staged_vars(): + for staging_ops in self.variable_mgr.staging_vars_on_devices: + gpu_compute_stage_ops.extend( + [put_op for _, (put_op, _) in six.iteritems(staging_ops)]) + enqueue_ops.append(tf.group(*gpu_compute_stage_ops, + name='gpu_compute_stage_ops_group')) + if gpu_grad_stage_ops: + staging_delta_ops += gpu_grad_stage_ops + if staging_delta_ops: + enqueue_ops.append(tf.group(*(staging_delta_ops))) + + if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and + self.params.variable_update == 'replicated'): + # We need to get all the update ops instead of only those for the first + # tower. This is because during evaluation, each tower will read from its + # own tower's moving averages instead of the first tower's moving + # averages. + # TODO(reedwm): Have each tower read from the first tower's moving + # averages for a slight performance gain. 
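+      # Note that calling tf.get_collection without a scope argument (below)
+      # returns the UPDATE_OPS from every tower's name scope, whereas the call
+      # inside the tower loop above filtered them to the first tower only.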
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN, + value=self.batch_size // len(self.raw_devices)) + + fetches = self._build_fetches(global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, + phase_train) + fetches['all_logits'] = all_logits + return (input_producer_op, enqueue_ops, fetches) + + def _build_fetches(self, global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, phase_train): + """Complete construction of model graph, populating the fetches map.""" + fetches = {} + if enqueue_ops: + fetches['enqueue_ops'] = enqueue_ops + for name, ops in all_accuracy_ops.items(): + # For fetches that starts with 'tensor:', keep dimension and skip reducing + # them to scalars. + if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX): + key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):] + fetches[key] = tf.concat(ops, 0) + else: + fetches[name] = tf.reduce_sum(ops) / self.batch_size + if self.task_index == 0 and self.params.summary_verbosity >= 1: + tf.summary.scalar(name, fetches[name]) + + if not phase_train: + if self.params.forward_only: + fetches['all_logits'] = tf.concat(all_logits, 0) + return fetches + apply_gradient_devices, gradient_state = ( + self.variable_mgr.preprocess_device_grads(device_grads)) + + # TODO(reedwm): Greatly simplify the learning rate code. + if (self.params.variable_update == 'horovod' or + self.params.variable_update == 'collective_all_reduce'): + # Each worker independently increments global_step. + examples_per_step = self.batch_size * self.num_workers + else: + # global_step is shared by all workers, and so every iteration + # global_step is incremented by num_workers. + examples_per_step = self.batch_size + if self.params.compute_lr_on_cpu: + with tf.device(self.cpu_device): + learning_rate = get_learning_rate(self.params, global_step, + self.dataset.num_examples_per_epoch(), + self.model, examples_per_step) + + training_ops = [] + for d, device in enumerate(apply_gradient_devices): + with tf.device(device): + with tf.name_scope('average_loss'): + average_loss = tf.reduce_mean(losses) + with tf.name_scope('get_gradients_to_apply'): + avg_grads = self.variable_mgr.get_gradients_to_apply(d, + gradient_state) + + if not self.params.compute_lr_on_cpu: + # We compute the learning rate once for each device in + # `apply_gradient_devices`. 
+ learning_rate = get_learning_rate( + self.params, global_step, self.dataset.num_examples_per_epoch(), + self.model, examples_per_step) + gradient_clip = self.params.gradient_clip + if gradient_clip is not None: + with tf.name_scope('clip_gradients'): + clipped_grads = [(tf.clip_by_value(grad, -gradient_clip, + +gradient_clip), var) + for grad, var in avg_grads] + else: + clipped_grads = avg_grads + + learning_rate = tf.identity(learning_rate, name='learning_rate_tensor') + opt = get_optimizer(self.params, learning_rate) + loss_scale_params = variable_mgr_util.AutoLossScaleParams( + enable_auto_loss_scale=self.enable_auto_loss_scale, + loss_scale=self.loss_scale, + loss_scale_normal_steps=self.loss_scale_normal_steps, + inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n, + is_chief=not self.job_name or self.task_index == 0) + + with tf.name_scope('append_apply_gradient_ops'): + self.variable_mgr.append_apply_gradients_ops( + gradient_state, opt, clipped_grads, training_ops, + loss_scale_params) + train_op = tf.group(*(training_ops + update_ops), name='train_ops_group') + + with tf.device(self.cpu_device): + if self.task_index == 0 and self.params.summary_verbosity >= 1: + tf.summary.scalar('learning_rate', learning_rate) + tf.summary.scalar(self.params.loss_type_to_report, average_loss) + if self.loss_scale is not None: + tf.summary.scalar('loss_scale', self.loss_scale) + if self.loss_scale_normal_steps: + tf.summary.scalar('loss_scale_normal_steps', + self.loss_scale_normal_steps) + + if self.params.summary_verbosity >= 2: + self.gradient_histogram_summary(avg_grads) + + if self.params.summary_verbosity >= 3: + for grad, var in avg_grads: + if grad is not None: + tf.summary.histogram(var.op.name + '/gradients', grad) + for var in tf.trainable_variables(): + tf.summary.histogram(var.op.name, var) + + fetches['train_op'] = train_op + fetches['average_loss'] = average_loss + return fetches + + def gradient_histogram_summary(self, avg_grads): + """Create histogram of log values of all non-zero gradients.""" + with tf.name_scope('log_gradients_summary'): + all_grads = [] + for grad, _ in avg_grads: + all_grads.append(tf.reshape(grad, [-1])) + grads = tf.abs(tf.concat(all_grads, 0)) + # exclude grads with zero values. + indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0)) + log_grads = tf.reshape( + tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1]) + tf.summary.histogram('log_gradients', log_grads) + + def _build_model_single_session(self): + """Build the TensorFlow graph for multiple replicas in a single_session. + + Returns: + input_producer_op: + enqueue_ops: + fetches: + + Raises: + ValueError: optimizer not recognized. + + Single session runs multiple model replicas as part of one large + distributed graph, whose global execution is always step-synchronized. 
+ """ + # verify assumptions + assert self.params.task_index == 0 + assert not self._doing_eval + assert not self.params.forward_only + assert not self.params.staged_vars + + tf.set_random_seed(self.params.tf_random_seed) + np.random.seed(4321) + phase_train = True + + log_fn('Generating training model') + losses = [] + device_grads = [] + all_logits = [] + all_accuracy_ops = {} + gpu_compute_stage_ops = [] + gpu_grad_stage_ops = [] + + with tf.device(self.global_step_device): + global_step = tf.train.get_or_create_global_step() + + update_ops = [] + global_input_producer_op = [] + + is_local = not self.job_name + if is_local: + assert self.num_workers == 1 + for task_num in range(self.num_workers): + # Reset the devices that self.variable_mgr knows about to those + # belonging to the next worker (task). + self.reset_devices_for_task(task_num, is_local) + # Build the per-worker image processing + with tf.name_scope('input_processing'): + input_processing_info = self._build_input_processing( + shift_ratio=(task_num / self.num_workers)) + if input_processing_info.input_producer_op is not None: + global_input_producer_op.extend(input_processing_info.input_producer_op) + # Build the per-worker model replica. + for rel_device_num in range(len(self.devices)): + abs_device_num = task_num * len(self.devices) + rel_device_num + with self.variable_mgr.create_outer_variable_scope( + abs_device_num), tf.name_scope( + 'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope: + task_results = self.add_forward_pass_and_gradients( + phase_train, rel_device_num, abs_device_num, + input_processing_info, gpu_compute_stage_ops, gpu_grad_stage_ops) + + if self.params.backbone_model_path: + self.model.add_backbone_saver() + + if phase_train: + losses.append(task_results['loss']) + device_grads.append(task_results['gradvars']) + else: + all_logits.append(task_results['logits']) + if not phase_train or self.params.print_training_accuracy: + for name, op in task_results.items(): + if name.startswith('accuracy:'): + key = name[9:] + if key not in all_accuracy_ops: + all_accuracy_ops[key] = [] + all_accuracy_ops[key].append(op) + + if rel_device_num == 0: + # Retain the Batch Normalization updates operations only + # from the first tower. These operations update the moving + # mean and moving variance variables, which are updated + # (but not used) during training, and used during + # evaluation. The moving mean and variance approximate the + # true mean and variance across all images in the + # dataset. Therefore, in replicated mode, these moving + # averages would be almost identical for each tower, and + # so we only update and save the moving averages for one + # tower. In parameter server mode, all towers share a copy + # of the variables so we also only need to update and save + # the moving averages once. 
+ update_ops.extend( + tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)) + assert not self.variable_mgr.staging_delta_ops + + enqueue_ops = [] + if gpu_compute_stage_ops: + enqueue_ops.append(tf.group(*gpu_compute_stage_ops, + name='gpu_compute_stage_ops')) + assert not self.variable_mgr.supports_staged_vars() + assert not gpu_grad_stage_ops + + fetches = self._build_fetches(global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, + phase_train) + if global_input_producer_op: + global_input_producer_op = tf.group(*global_input_producer_op) + else: + global_input_producer_op = None + return (global_input_producer_op, enqueue_ops, fetches) + + def add_forward_pass_and_gradients(self, + phase_train, + rel_device_num, + abs_device_num, + input_processing_info, + gpu_compute_stage_ops, + gpu_grad_stage_ops): + """Add ops for forward-pass and gradient computations.""" + nclass = self.dataset.num_classes + if self.datasets_use_prefetch: + assert input_processing_info.multi_device_iterator_input, ( + 'multi_device_iterator_input cannot be None if ' + 'datasets_use_prefetch=True') + input_list = ( + input_processing_info.multi_device_iterator_input[rel_device_num]) + else: + if not self.dataset.use_synthetic_gpu_inputs(): + input_producer_stage = input_processing_info.input_producer_stages[ + rel_device_num] + with tf.device(self.cpu_device): + host_input_list = input_producer_stage.get() + with tf.device(self.raw_devices[rel_device_num]): + gpu_compute_stage = data_flow_ops.StagingArea( + [inp.dtype for inp in host_input_list], + shapes=[inp.get_shape() for inp in host_input_list]) + # The CPU-to-GPU copy is triggered here. + gpu_compute_stage_op = gpu_compute_stage.put(host_input_list) + input_list = gpu_compute_stage.get() + gpu_compute_stage_ops.append(gpu_compute_stage_op) + else: + with tf.device(self.raw_devices[rel_device_num]): + # Minor hack to avoid H2D copy when using synthetic data + input_list = self.model.get_synthetic_inputs( + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass) + + # Labels reshaping happens all on gpu:0. Reshaping synthetic labels on + # multiple devices slows down XLA computation for an unknown reason. + # TODO(b/116875203): Find/address root cause of XLA slow down. + labels_device_placement_hack = ( + self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile) + + def device_aware_reshape(tensor, shape): + device = self.devices[rel_device_num] + # Labels are int32, place reshapes on gpu:0 (no device placement) when the + # hack is enabled. + if labels_device_placement_hack and tensor.dtype == tf.int32: + device = '' + with tf.device(device): + return tf.reshape(tensor, shape=shape) + + subset = 'validation' if self._doing_eval else 'train' + input_shapes = self.model.get_input_shapes(subset) + input_list = [ + device_aware_reshape(input_list[i], shape=input_shapes[i]) + for i in range(len(input_list)) + ] + + def forward_pass_and_gradients(): + """Builds forward pass and gradient computation network. + + When phase_train=True and print_training_accuracy=False: + return [loss] + grads + + When phase_train=True and print_training_accuracy=True: + return [logits, loss] + grads + + When phase_train=False, + return [logits] + + Its output can always be unpacked by + + ``` + outputs = forward_pass_and_gradients() + logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs) + ``` + + Returns: + outputs: A list of tensors depending on different modes. 
+ """ + + build_network_result = self.model.build_network( + input_list, phase_train, nclass) + logits = build_network_result.logits + + if not phase_train: + return [logits] + + base_loss = self.model.loss_function(input_list, build_network_result) + params = self.variable_mgr.trainable_variables_on_device( + rel_device_num, abs_device_num) + l2_loss = None + total_loss = base_loss + with tf.name_scope('l2_loss'): + fp32_params = params + if self.model.data_type == tf.float16 and self.params.fp16_vars: + # fp16 reductions are very slow on GPUs, so cast to fp32 before + # calling tf.nn.l2_loss and tf.add_n. + # TODO(b/36217816): Once the bug is fixed, investigate if we should do + # this reduction in fp16. + fp32_params = (tf.cast(p, tf.float32) for p in params) + filtered_params = self.model.filter_l2_loss_vars(fp32_params) + if rel_device_num == len(self.devices) - 1: + # We compute the L2 loss for only one device instead of all of them, + # because the L2 loss for each device is the same. To adjust for this, + # we multiply the L2 loss by the number of devices. We choose the + # last device because for some reason, on a Volta DGX1, the first four + # GPUs take slightly longer to complete a step than the last four. + # TODO(reedwm): Shard the L2 loss computations across GPUs. + if self.params.single_l2_loss_op: + # TODO(reedwm): If faster, create a fused op that does the L2 loss + # on multiple tensors, and use that instead of concatenating + # tensors. + reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params] + l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0)) + else: + l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params]) + weight_decay = self.params.weight_decay + mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay) + if (weight_decay is not None and weight_decay != 0. and + l2_loss is not None): + mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION, + value=weight_decay) + total_loss += len(self.devices) * weight_decay * l2_loss + + aggmeth = tf.AggregationMethod.DEFAULT + scaled_loss = (total_loss if self.loss_scale is None + else total_loss * self.loss_scale) + grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth) + if self.params.sparse_to_dense_grads: + # Passing a sparse gradient to convert_to_tensor turns it into a dense + # gradient. A sparse gradient is an instance of tf.IndexedSlices. + # convert_to_tensor does not modify dense tensors. + grads = [tf.convert_to_tensor(g) for g in grads] + if self.loss_scale is not None: + # TODO(reedwm): If automatic loss scaling is not used, we could avoid + # these multiplications by directly modifying the learning rate instead. + # If this is done, care must be taken to ensure that this scaling method + # is correct, as some optimizers square gradients and do other + # operations which might not be compatible with modifying both the + # gradients and the learning rate. + + grads = [ + grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads + ] + + if self.params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + if self.params.horovod_device: + horovod_device = '/%s:0' % self.params.horovod_device + else: + horovod_device = '' + # All-reduce gradients using Horovod. 
+ grads = [hvd.allreduce(grad, average=False, device_dense=horovod_device) + for grad in grads] + + if self.params.staged_vars: + grad_dtypes = [grad.dtype for grad in grads] + grad_shapes = [grad.shape for grad in grads] + grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes) + grad_stage_op = grad_stage.put(grads) + # In general, this decouples the computation of the gradients and + # the updates of the weights. + # During the pipeline warm up, this runs enough training to produce + # the first set of gradients. + gpu_grad_stage_ops.append(grad_stage_op) + grads = grad_stage.get() + + if self.params.loss_type_to_report == 'total_loss': + loss = total_loss + else: + loss = base_loss + + if self.params.print_training_accuracy: + return [logits, loss] + grads + else: + return [loss] + grads + + def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs): + """Unpacks outputs from forward_pass_and_gradients. + + Args: + forward_pass_and_grad_outputs: Output from forward_pass_and_gradients. + + Returns: + logits: Unscaled probability distribution from forward pass. + If unavailable, None is returned. + loss: Loss function result from logits. + If unavailable, None is returned. + grads: Gradients for all trainable variables. + If unavailable, None is returned. + """ + logits = None + # logits is only fetched in non-train mode or when + # print_training_accuracy is set. + if not phase_train or self.params.print_training_accuracy: + logits = forward_pass_and_grad_outputs.pop(0) + + loss = ( + forward_pass_and_grad_outputs[0] + if forward_pass_and_grad_outputs else None) + grads = ( + forward_pass_and_grad_outputs[1:] + if forward_pass_and_grad_outputs else None) + + return logits, loss, grads + + def make_results(logits, loss, grads): + """Generate results based on logits, loss and grads.""" + results = {} # The return value + + if logits is not None: + results['logits'] = logits + accuracy_ops = self.model.accuracy_function(input_list, logits) + for name, op in accuracy_ops.items(): + results['accuracy:' + name] = op + + if loss is not None: + results['loss'] = loss + + if grads is not None: + param_refs = self.variable_mgr.trainable_variables_on_device( + rel_device_num, abs_device_num, writable=True) + results['gradvars'] = list(zip(grads, param_refs)) + + return results + + with tf.device(self.devices[rel_device_num]): + outputs = maybe_compile(forward_pass_and_gradients, self.params) + logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs) + return make_results(logits, loss, grads) + + def get_input_preprocessor(self): + """Returns the image preprocessor to used, based on the model. + + Returns: + The image preprocessor, or None if synthetic data should be used. + """ + shift_ratio = 0 + if self.job_name: + # shift_ratio prevents multiple workers from processing the same batch + # during a step + shift_ratio = self.task_index / self.num_workers + + processor_class = self.dataset.get_input_preprocessor( + self.params.input_preprocessor) + assert processor_class + subset = 'validation' if self._doing_eval else 'train' + return processor_class( + self.batch_size * self.batch_group_size, + self.model.get_input_shapes(subset), + len(self.devices) * self.batch_group_size, + dtype=self.model.data_type, + train=(not self._doing_eval), + # TODO(laigd): refactor away image model specific parameters. 
+ distortions=self.params.distortions, + resize_method=self.resize_method, + shift_ratio=shift_ratio, + summary_verbosity=self.params.summary_verbosity, + distort_color_in_yiq=self.params.distort_color_in_yiq, + fuse_decode_and_crop=self.params.fuse_decode_and_crop, + match_mlperf=self.params.ml_perf) + + def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list): + """Adds ops to enqueue on all worker queues. + + Args: + name_prefix: prefixed for the shared_name of ops. + enqueue_after_list: control dependency from ops. + + Returns: + An op that should be used as control dependency before starting next step. + """ + self.sync_queue_counter += 1 + with tf.device(self.sync_queue_devices[( + self.sync_queue_counter % len(self.sync_queue_devices))]): + sync_queues = [ + tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]], + shared_name='%s%s' % (name_prefix, i)) + for i in range(self.num_workers)] + queue_ops = [] + # For each other worker, add an entry in a queue, signaling that it can + # finish this step. + token = tf.constant(False) + with tf.control_dependencies(enqueue_after_list): + for i, q in enumerate(sync_queues): + if i == self.task_index: + queue_ops.append(tf.no_op()) + else: + queue_ops.append(q.enqueue(token)) + + # Drain tokens off queue for this worker, one for each other worker. + queue_ops.append( + sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1)) + + return tf.group(*queue_ops) + + +def _is_mkl_flag_absent(mkl_flag): + return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS + and absl_flags.FLAGS[mkl_flag].present) + + +def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var): + tf.logging.warn( + ('OS ENV variable %s=%s is ignored and script default: ' + '%s is used. Use --%s to override.') % + (os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag)) + + +def set_default_param_values_and_env_vars(params): + """Sets up the default param values and environment variables .""" + if params.batchnorm_persistent: + os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' + else: + os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None) + if params.winograd_nonfused: + os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + else: + os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None) + if params.autotune_threshold: + os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold) + os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish)) + argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # Sets environment variables for MKL + # If OS ENV vars are overridden by script defaults, a warning msg is printed. 
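+  # Illustrative example (flag values are hypothetical): running with
+  # --mkl=True --kmp_blocktime=0 --num_intra_threads=14 exports
+  # KMP_BLOCKTIME=0 and OMP_NUM_THREADS=14; kmp_settings and kmp_affinity are
+  # exported as KMP_SETTINGS and KMP_AFFINITY in the same way.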
+ if params.mkl: + mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity', + 'num_intra_threads'] + for mkl_flag in mkl_flags: + os_env_var = mkl_flag.upper() + if mkl_flag == 'num_intra_threads': + os_env_var = 'OMP_NUM_THREADS' + flag_val = str(getattr(params, mkl_flag)) + if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ: + _print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var) + os.environ[os_env_var] = flag_val + if mkl_flag == 'num_intra_threads' and not params.num_intra_threads: + os.environ.pop(os_env_var, None) + + # Sets GPU thread settings + if params.device.lower() == 'gpu': + params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower()) + if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']: + raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode) + os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode + + if params.per_gpu_thread_count and params.gpu_thread_mode == 'global': + raise ValueError( + 'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' % + params.per_gpu_thread_count) + # Default to two threads. One for the device compute and the other for + # memory copies. + per_gpu_thread_count = params.per_gpu_thread_count or 2 + total_gpu_thread_count = per_gpu_thread_count * params.num_gpus + + if params.gpu_thread_mode == 'gpu_private': + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + elif params.gpu_thread_mode == 'gpu_shared': + os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count) + + cpu_count = multiprocessing.cpu_count() + if not params.num_inter_threads and params.gpu_thread_mode in [ + 'gpu_private', 'gpu_shared' + ]: + main_thread_count = max(cpu_count - total_gpu_thread_count, 1) + params = params._replace(num_inter_threads=main_thread_count) + + if (params.datasets_use_prefetch and + params.datasets_num_private_threads is None): + # From the total cpu thread count, subtract the total_gpu_thread_count, + # and then 2 threads per GPU device for event monitoring and sending / + # receiving tensors + num_monitoring_threads = 2 * params.num_gpus + num_private_threads = max( + cpu_count - total_gpu_thread_count - num_monitoring_threads, 1) + params = params._replace(datasets_num_private_threads=num_private_threads) + return params + + +def setup(params): + """Sets up the environment that BenchmarkCNN should run in. + + Args: + params: Params tuple, typically created by make_params or + make_params_from_flags. + + Returns: + A potentially modified params. + Raises: + ValueError: invalid parames combinations. + """ + # Set up environment variables before doing any other global initialization to + # make sure it uses the appropriate environment variables. + params = set_default_param_values_and_env_vars(params) + + # horovod needs to be initialized before create_config_proto() call since + # it will be used in config generation if enabled. + if params.variable_update == 'horovod': + import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top + hvd.init() + + platforms_util.initialize(params, create_config_proto(params)) + + if not params.job_name: + # Create a dummy session to initialize TF global variables using the input + # params. Otherwise, ListDevices function may create global devices using + # the default config instead of using the user provided config. + # + # TODO(hinsu): Find a way to achieve the same for distributed benchmark. It + # is not legal to create distributed session after local session. 
It is also + # not possible to create distributed session here as that results in + # multiple creation of ClusterManager and Server. + with tf.Session(config=create_config_proto(params)) as sess: + del sess + + return params + + +def maybe_compile(computation, params): + if params and params.xla_compile: + return tf.xla.experimental.compile(computation) + else: + return computation() diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test.py b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test.py new file mode 100644 index 0000000000000000000000000000000000000000..43dac487f90e1014f9429b12a89fa93ac5ef19be --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test.py @@ -0,0 +1,493 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests running benchmark_cnn in distributed mode. + +This is done by spawning one process per task. Each process runs +benchmark_cnn_distributed_test_runner.py. + +The output for each process is written to disk and can be viewed to debug tests. +See get_test_output_dir() in platforms/default/util.py for more info. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from collections import namedtuple +import os +import subprocess +import time +import unittest + +from absl import flags as absl_flags +import portpicker +import six +import tensorflow.compat.v1 as tf +import flags +import test_util +from platforms import util as platforms_util + +FLAGS = absl_flags.FLAGS + + +def _convert_params_to_flags_list(params): + """Converts Params to a list of flags. Skips default-valued parameters. + + E.g., converts + benchmark_cnn.make_params(batch_size=32, model='resnet50') + to + ['--batch_size=32', '--model=resnet50'] + + Args: + params: Params for BenchmarkCNN. + Returns: + A list of flags. + """ + return [ + '--%s=%s' % (k, str(v)) for k, v in six.iteritems(params._asdict()) + if v != flags.param_specs[k].default_value + ] + + +# When outputting a process's output in the log, maximum number of characters +# to output. The log system does not allow us to output more than this in a +# single log message, but this limit is also useful to avoid the logs from +# becoming too large (the full process output is written to disk). +MAX_OUTPUT_CHARS = 15000 + + +# A process. name is a string identifying the process in logs. stdout and +# stderr are file objects of the process's stdout and stderr, respectively. +_ProcessInfo = namedtuple('_ProcessInfo', ['name', 'popen', 'stdout', 'stderr']) + + +def _create_task_process(job_name, task_index, args, env, output_dir): + """Creates a process for a single task for benchmark_cnn. + + Args: + job_name: 'worker' or 'ps' or ''. Empty string used for non-distributed + mode. + task_index: The index of the task within the cluster. 
+ args: A list of arguments to pass to the task. This function additionally + sets --task_index and --job_name + env: The environment to use for the task. + output_dir: Where to place the output files, storing the task's stdout and + stderr. + Returns: + A _ProcessInfo namedtuple of the running process. The stdout and stderr + fields of this tuple must be closed by the caller once the process ends. + """ + args = args[:] + args += ['--task_index=%s' % task_index, '--job_name=%s' % job_name] + name_prefix = job_name or 'local' + process_name = '%s_%s' % (name_prefix, task_index) + tf.logging.info('Spawning %s process: %s' % (process_name, ' '.join(args))) + stdout_filename = os.path.join(output_dir, '%s_stdout.txt' % process_name) + stderr_filename = os.path.join(output_dir, '%s_stderr.txt' % process_name) + stdout_file = open(stdout_filename, 'w+') + stderr_file = open(stderr_filename, 'w+') + popen = subprocess.Popen( + args, stdout=stdout_file, stderr=stderr_file, env=env) + return _ProcessInfo(process_name, popen, stdout_file, stderr_file) + + +def _wait_for_processes(wait_processes, kill_processes): + """Waits until all `wait_processes` finish, then kills `kill_processes`. + + Fails an assert if a process in `wait_processes` finishes unsuccessfully. + The processes in `kill_processes` are assumed to never finish so they are + killed. + + Args: + wait_processes: A list of _ProcessInfo tuples. This function will wait + for each to finish. + kill_processes: A list of _ProcessInfo tuples. Each will be killed once + every process in `wait_processes` is finished. + Returns: + A list of strings, each which is a string of the stdout of a wait process. + """ + wait_process_stdouts = [None] * len(wait_processes) + finished_wait_processes = set() + while len(finished_wait_processes) < len(wait_processes): + for i, wait_process in enumerate(wait_processes): + if i in finished_wait_processes: + continue + ret_code = wait_process.popen.poll() + if ret_code is None: + continue + tf.logging.info('{} finished'.format(wait_process.name)) + wait_process.stdout.seek(0) + wait_process_stdouts[i] = wait_process.stdout.read() + tf.logging.info('stdout for {} (last {} chars): {}\n'.format( + wait_process.name, MAX_OUTPUT_CHARS, + wait_process_stdouts[i][-MAX_OUTPUT_CHARS:])) + wait_process.stderr.seek(0) + tf.logging.info('stderr for {} (last {} chars): {}\n'.format( + wait_process.name, MAX_OUTPUT_CHARS, + wait_process.stderr.read()[-MAX_OUTPUT_CHARS:])) + assert ret_code == 0, 'Process failed with return code %d' % ret_code + finished_wait_processes.add(i) + for kill_process in kill_processes: + ret_code = kill_process.popen.poll() + # kill processes should not end until we kill them. + assert ret_code is None, 'Process returned early with code %d' % ret_code + time.sleep(0.25) + tf.logging.info('All wait processes finished') + for i, kill_process in enumerate(kill_processes): + # Kill each kill process. + kill_process.popen.kill() + kill_process.popen.wait() + kill_process.stdout.seek(0) + tf.logging.info('stdout for {} (last {} chars): {}\n'.format( + kill_process.name, MAX_OUTPUT_CHARS, + kill_process.stdout.read()[-MAX_OUTPUT_CHARS:])) + kill_process.stderr.seek(0) + tf.logging.info('stderr for {} (last {} chars): {}\n'.format( + kill_process.name, MAX_OUTPUT_CHARS, + kill_process.stderr.read()[-MAX_OUTPUT_CHARS:])) + return wait_process_stdouts + + +def _spawn_benchmark_processes(output_dir_path, num_workers, num_ps, + num_controllers, params): + """Run training or evaluation in spawned processes. 
+ + Runs locally if num_workers == 1, num_ps == 0, and num_controllers == 0, + otherwise runs in distributed mode. In either case, one process is spawned + per worker and ps. Waits for training/evaluation to finish before returning. + + Args: + output_dir_path: Relative path where stdout and stderr files will be + placed. + num_workers: Number of workers to spawn. + num_ps: Number of ps processes to spawn. + num_controllers: Number of controller processes to spawn (must be 0 or 1). + params: Params for BenchmarkCNN in each subprocess. + Returns: + A list output_list of outputs from all processes that output the + images/sec and accuracy. This process is the controller host in + distributed_all_reduce, and the workers otherwise. output_list[i] is a + list of lines from the ith worker's stdout. + """ + run_distributed = num_workers != 1 or num_ps != 0 or num_controllers != 0 + if params.variable_update == 'distributed_all_reduce': + assert num_controllers == 1 or not run_distributed + assert num_ps == 0 + else: + assert num_controllers == 0 + output_base_dir = platforms_util.get_test_output_dir() + output_dir = os.path.join(output_base_dir, output_dir_path) + os.makedirs(output_dir) + tf.logging.info('Outputs of processes will be outputted to: %s' % output_dir) + + args = platforms_util.get_command_to_run_python_module( + 'benchmark_cnn_distributed_test_runner') + args += _convert_params_to_flags_list(params) + if run_distributed: + worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] + ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] + controller_ports = [portpicker.pick_unused_port() + for _ in range(num_controllers)] + # The numerator is 0.7 instead of 1 to leave some memory for the Cuda + # runtime, etc. + gpu_memory_frac = 0.7 / num_workers + args += [ + '--gpu_memory_frac_for_testing=%f' % gpu_memory_frac, + '--worker_hosts=' + ','.join('localhost:%d' % p for p in worker_ports) + ] + if num_ps > 0: + ps_hosts_str = ','.join('localhost:%d' % p for p in ps_ports) + args.append('--ps_hosts=' + ps_hosts_str) + else: + controller_host_str = ','.join('localhost:%d' % p + for p in controller_ports) + args.append('--controller_host=' + controller_host_str) + env = os.environ.copy() + # Allow stdout to be viewed before the process ends. + env['PYTHONUNBUFFERED'] = '1' + + worker_processes = [] + ps_processes = [] + controller_processes = [] + try: + for i in range(num_workers): + job_name = 'worker' if run_distributed else '' + process = _create_task_process(job_name, i, args, env, output_dir) + worker_processes.append(process) + # Don't let ps or controller processes use the gpu. + env['CUDA_VISIBLE_DEVICES'] = '' + + for i in range(num_ps): + process = _create_task_process('ps', i, args, env, output_dir) + ps_processes.append(process) + for i in range(num_controllers): + process = _create_task_process('controller', i, args, env, output_dir) + controller_processes.append(process) + # If all distributed all reduce mode is being used, the controller process + # finishes and the worker processes block forever. Otherwise, the worker + # processes finish and the ps processes block forever. We set + # wait_processes and kill_processes accordingly. 
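+    # Illustrative example: with variable_update=distributed_all_reduce and
+    # two workers, the single controller process is waited on and the two
+    # worker processes are killed once it exits; in the parameter-server
+    # setups the workers are waited on and the ps processes are killed.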
+ if controller_processes: + wait_processes = controller_processes + kill_processes = worker_processes + else: + wait_processes = worker_processes + kill_processes = ps_processes + outputs = _wait_for_processes(wait_processes, kill_processes) + finally: + for process in worker_processes + ps_processes + controller_processes: + try: + process.popen.kill() + except OSError: + pass # It's OK (and expected) if the process already exited. + process.stdout.close() + process.stderr.close() + return [output.splitlines() for output in outputs] + + +# When this test class is run, a method will fail about 0.3% of the time with a +# gRPC error. It is not clear why this occurs. +# TODO(reedwm): Fix this test class. +class TfCnnBenchmarksDistributedTest(tf.test.TestCase): + """Tests running benchmark_cnn in distributed mode.""" + + # We cannot check for a GPU via tf.test.is_gpu_available() before the tests in + # this class because it allocates all the GPU memory which would cause the + # spawned processes to run out of GPU memory. + + def _test_distributed(self, + test_name, + num_workers, + num_ps, + params, + num_controllers=0, + check_output_values=False, + skip=None): + # TODO(reedwm): check_output_values should default to True and be enabled + # on every test. See the TODO in benchmark_cnn_test.py. + def run_fn(run_type, inner_params): + output_dir_path = os.path.join(test_name, run_type) + if run_type == 'Evaluation': + # Distributed evaluation is not supported, so we use a single process. + # We still must spawn another process, because if we evaluate in the + # current process, it would allocate the GPU memory causing future test + # methods to fail. + if inner_params.variable_update == 'distributed_replicated': + inner_params = inner_params._replace(variable_update='replicated') + return _spawn_benchmark_processes( + output_dir_path, num_workers=1, num_ps=0, num_controllers=0, + params=inner_params) + else: + return _spawn_benchmark_processes(output_dir_path, num_workers, num_ps, + num_controllers, inner_params) + + return test_util.train_and_eval(self, run_fn, params, + check_output_values=check_output_values, + skip=skip) + + def testParameterServer(self): + test_name = 'testParameterServer' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 2, 2, params) + + def testParameterServerStaged(self): + test_name = 'testParameterServerStaged' + params = test_util.get_params(test_name)._replace(staged_vars=True) + self._test_distributed(test_name, 2, 2, params) + + def testReplicated(self): + test_name = 'testReplicated' + params = test_util.get_params(test_name)._replace( + variable_update='distributed_replicated') + self._test_distributed(test_name, 2, 2, params) + + def testAllReducePsgpu(self): + test_name = 'testAllReducePsgpu' + flags_dict = test_util.get_params(test_name)._replace( + variable_update='distributed_all_reduce', + all_reduce_spec='psgpu#4') + self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1) + + def testAllReducePscpuXring(self): + test_name = 'testAllReducePscpuXring' + flags_dict = test_util.get_params(test_name)._replace( + variable_update='distributed_all_reduce', + all_reduce_spec='pscpu:2k:xring') + self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1) + + def testForwardOnly(self): + test_name = 'testForwardOnly' + params = test_util.get_params(test_name)._replace(forward_only=True) + # Evaluation is not supported with --forward_only, so we set skip='eval'. 
+ self._test_distributed(test_name, 2, 2, params, skip='eval') + + def testSingleWorkerAndPs(self): + test_name = 'testSingleWorkerAndPs' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 1, 1, params) + + def testThreeWorkersAndPses(self): + test_name = 'testThreeWorkersAndPses' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 3, 3, params) + + def testOneWorkerThreePses(self): + test_name = 'testOneWorkerThreePses' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 1, 3, params) + + def testThreeWorkersOnePs(self): + test_name = 'testThreeWorkersOnePs' + params = test_util.get_params(test_name) + self._test_distributed(test_name, 3, 1, params) + + def testNoPrintTrainingAccuracy(self): + test_name = 'testNoPrintTrainingAccuracy' + params = test_util.get_params(test_name)._replace( + print_training_accuracy=False) + self._test_distributed(test_name, 2, 2, params) + + def testRmspropParameterServer(self): + test_name = 'testRmspropParameterServer' + params = test_util.get_params(test_name)._replace(optimizer='rmsprop') + self._test_distributed(test_name, 2, 2, params) + + def testMomentumReplicated(self): + test_name = 'testMomentumReplicated' + params = test_util.get_params(test_name)._replace( + optimizer='momentum', variable_update='distributed_replicated') + self._test_distributed(test_name, 2, 2, params) + + def testNoCrossReplicaSyncParameterServerStaged(self): + test_name = 'testNoCrossReplicaSyncParameterServerStaged' + params = test_util.get_params(test_name)._replace( + staged_vars=True, cross_replica_sync=False) + self._test_distributed(test_name, 2, 2, params) + + def testSingleGpu(self): + test_name = 'testSingleGpu' + params = test_util.get_params(test_name)._replace(num_gpus=1) + self._test_distributed(test_name, 2, 2, params) + + def testBatchGroupSize(self): + test_name = 'testBatchGroupSize' + params = test_util.get_params(test_name)._replace( + batch_group_size=4, num_batches=100, num_warmup_batches=5) + self._test_distributed(test_name, 2, 2, params) + + def testFp16WithFp32Vars(self): + test_name = 'testFp16WithFp32Vars' + params = test_util.get_params(test_name)._replace( + use_fp16=True, fp16_vars=False) + self._test_distributed(test_name, 2, 2, params) + + def testFp16WithFp16Vars(self): + test_name = 'testFp16WithFp16Vars' + params = test_util.get_params(test_name)._replace( + use_fp16=True, fp16_vars=True, fp16_loss_scale=1.) 
+ self._test_distributed(test_name, 2, 2, params) + + def testFp16Replicated(self): + test_name = 'testFp16Replicated' + params = test_util.get_params(test_name)._replace( + use_fp16=True, variable_update='distributed_replicated') + self._test_distributed(test_name, 2, 2, params) + + @unittest.skip('b/147310862: Fails for unknown reason') + def testReplicatedRealData(self): + test_name = 'testReplicatedRealData' + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params(test_name)._replace( + variable_update='distributed_replicated', + data_dir=imagenet_dir, + data_name='imagenet') + self._test_distributed(test_name, 2, 2, params) + + +class DistributedVariableUpdateTest(tf.test.TestCase): + """Tests that variables are updated correctly in distributed mode.""" + + def _test_variable_update(self, + test_name, + num_workers, + num_ps, + params, + num_controllers=0): + """Tests variables are updated correctly when the given params are used.""" + output_dir_path = os.path.join(test_name, 'variable_update') + logs = _spawn_benchmark_processes(output_dir_path, num_workers, num_ps, + num_controllers, params) + actual_losses = [] + for worker_logs in logs: + outputs = test_util.get_training_outputs_from_logs( + worker_logs, params.print_training_accuracy) + actual_losses.append([x.loss for x in outputs]) + + inputs = test_util.get_fake_var_update_inputs() + expected_losses = test_util.TestCNNModel().manually_compute_losses( + inputs, num_workers, params) + if params.variable_update == 'distributed_all_reduce': + # In distributed all reduce, each step, the controller outputs the average + # of the loss from each worker. So we modify expected losses accordingly. + # E.g, we change [[1, 2], [4, 5]] to [[2.5, 3.5]] + expected_losses = [[sum(losses) / num_workers + for losses in zip(*expected_losses)]] + rtol = 3e-2 if params.use_fp16 else 1e-5 + for worker_actual_losses, worker_expected_losses in zip(actual_losses, + expected_losses): + self.assertAllClose(worker_actual_losses[:len(worker_expected_losses)], + worker_expected_losses, rtol=rtol, atol=0.) + + def _test_variable_updates(self, test_name, params): + """Tests variables are updated correctly with various variable updates.""" + + # Unfortunately, distributed parameter server is non-deterministic with + # multiple workers, because one worker may write to a variable before + # another worker reads it. This probably does not harm training, but it + # does mean we cannot easily test that case. So, we use one worker. 
+ self._test_variable_update( + test_name + '_ps', num_workers=1, num_ps=2, num_controllers=0, + params=params._replace(variable_update='parameter_server')) + + self._test_variable_update( + test_name + '_rep', num_workers=2, num_ps=1, num_controllers=0, + params=params._replace(variable_update='distributed_replicated')) + + self._test_variable_update( + test_name + '_allreduce', num_workers=2, num_ps=0, num_controllers=1, + params=params._replace(variable_update='distributed_all_reduce', + all_reduce_spec='psgpu#%d' % params.num_gpus)) + + def testVarUpdateDefault(self): + params = test_util.get_var_update_params() + self._test_variable_updates('testVarUpdateDefault', params) + + def testVarUpdateCpuAsLocalParamDevice(self): + params = test_util.get_var_update_params()._replace( + local_parameter_device='cpu') + self._test_variable_updates('testVarUpdateCpuAsLocalParamDevice', params) + + def testVarUpdateFp16(self): + params = test_util.get_var_update_params()._replace(use_fp16=True) + self._test_variable_updates('testVarUpdateFp16', params) + + def testVarUpdateResourceVars(self): + params = test_util.get_var_update_params()._replace(use_resource_vars=True) + self._test_variable_updates('testVarUpdateResourceVars', params) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test_runner.py b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..9291a801e4606c2b1982e5e1e0df833227a45e8f --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn_distributed_test_runner.py @@ -0,0 +1,122 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Used to run benchmark_cnn for distributed tests. + +In distributed tests, we spawn processes to run tf_cnn_benchmark tasks. We could +directly spawn tf_cnn_benchmark processes, but we want some added functionality, +such as being able to inject custom images during training. So instead, this +file is spawned as a Python process, which supports the added functionality. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags as absl_flags +import numpy as np +import tensorflow.compat.v1 as tf +import benchmark_cnn +import flags +import preprocessing +import test_util + + +absl_flags.DEFINE_string('fake_input', 'none', + """What fake input to inject into benchmark_cnn. This + is ignored if --model=test_model. + Options are: + none: Do not inject any fake input. + zeros_and_ones: Half the images will be all 0s with + a label of 0. 
Half the images will be all 1s with a + label of 1.""") + +flags.define_flags() +FLAGS = flags.FLAGS + + +def get_test_image_preprocessor(batch_size, params): + """Returns the preprocessing.TestImagePreprocessor that should be injected. + + Returns None if no preprocessor should be injected. + + Args: + batch_size: The batch size across all GPUs. + params: BenchmarkCNN's parameters. + Returns: + Returns the preprocessing.TestImagePreprocessor that should be injected. + Raises: + ValueError: Flag --fake_input is an invalid value. + """ + if FLAGS.fake_input == 'none': + return None + elif FLAGS.fake_input == 'zeros_and_ones': + half_batch_size = batch_size // 2 + images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32) + images[half_batch_size:, :, :, :] = 1 + labels = np.array([0] * half_batch_size + [1] * half_batch_size, + dtype=np.int32) + preprocessor = preprocessing.TestImagePreprocessor( + batch_size, [227, 227, 3], params.num_gpus, + benchmark_cnn.get_data_type(params)) + preprocessor.set_fake_data(images, labels) + preprocessor.expected_subset = 'validation' if params.eval else 'train' + return preprocessor + else: + raise ValueError('Invalid --fake_input: %s' % FLAGS.fake_input) + + +def run_with_real_model(params): + """Runs tf_cnn_benchmarks with a real model.""" + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + preprocessor = get_test_image_preprocessor(bench.batch_size, params) + if preprocessor is not None: + # The test image preprocessor requires queue runners. Since this file is + # used for testing, it is OK to access protected members. + # pylint: disable=protected-access + bench.dataset._queue_runner_required = True + # pylint: enable=protected-access + bench.input_preprocessor = preprocessor + bench.run() + + +def run_with_test_model(params): + """Runs tf_cnn_benchmarks with a test model.""" + model = test_util.TestCNNModel() + inputs = test_util.get_fake_var_update_inputs() + with test_util.monkey_patch(benchmark_cnn, + LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15): + bench = benchmark_cnn.BenchmarkCNN(params, dataset=test_util.TestDataSet(), + model=model) + # The test model does not use labels when computing loss, so the label + # values do not matter as long as it's the right shape. + labels = np.array([1] * inputs.shape[0]) + bench.input_preprocessor.set_fake_data(inputs, labels) + bench.run() + + +def main(_): + params = benchmark_cnn.make_params_from_flags() + params = benchmark_cnn.setup(params) + if params.model == 'test_model': + run_with_test_model(params) + else: + run_with_real_model(params) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.app.run() diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn_test.py b/cv/classification/resnet50/tensorflow/benchmark_cnn_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9e849739c4687e2f53803fdb8d40d9a7e97ccb80 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn_test.py @@ -0,0 +1,1493 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for benchmark_cnn.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import glob +import os +import re +import unittest + +import mock +import numpy as np +import tensorflow.compat.v1 as tf +from google.protobuf import text_format +from tensorflow.core.framework import step_stats_pb2 +from tensorflow.core.profiler import tfprof_log_pb2 +from tensorflow.python.platform import test +import benchmark_cnn +import datasets +import flags +import preprocessing +import test_util +import variable_mgr_util +from platforms import util as platforms_util + + +def _check_has_gpu(): + if not test.is_gpu_available(cuda_only=True): + raise ValueError( + """You have asked to run part or all of this on GPU, but it appears + that no GPU is available. If your machine has GPUs it is possible you + do not have a version of TensorFlow with GPU support. To build with GPU + support, add --config=cuda to the build flags.\n """) + + +class TfCnnBenchmarksModelTest(tf.test.TestCase): + """Tests which are run with multiple models.""" + + def setUp(self): + super(TfCnnBenchmarksModelTest, self).setUp() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + def get_model_name(self): + return None + + # Return true to run tests that don't need to be run on every model. + # This should be done for one or two cheap models. + def extended_tests(self): + return False + + # Return false to suppress actually running the model; this is useful + # for tests that are large. + def model_execution_test(self): + return False + + # Return false to suppress actually saving and loading the model. + def model_save_load_test(self): + return False + + def testSaveLoadModel(self): + _check_has_gpu() + if not self.get_model_name() or not self.model_save_load_test(): + return + + params = benchmark_cnn.make_params( + model=self.get_model_name(), + num_batches=1, + num_intra_threads=0, + num_inter_threads=0, + distortions=False, + batch_size=2, + variable_update='replicated', + num_warmup_batches=0, + num_gpus=2, + train_dir=test_util.get_temp_dir('testSaveLoadModel_' + + self.get_model_name())) + + # Run one batch and save the model. + # Note that this uses a non-test session. + bench = benchmark_cnn.BenchmarkCNN(params) + bench.run() + self.assertEqual(bench.init_global_step, 0) + # Clear the default graph. + tf.reset_default_graph() + # Test if checkpoint had been saved. + ckpt = tf.train.get_checkpoint_state(params.train_dir) + match = re.match(os.path.join(params.train_dir, r'model.ckpt-(\d+).index'), + ckpt.model_checkpoint_path + '.index') + self.assertTrue(match) + self.assertGreaterEqual(int(match.group(1)), params.num_batches) + params = params._replace(num_batches=2) + # Reload the model + bench = benchmark_cnn.BenchmarkCNN(params) + bench.run() + # Check if global step has been restored. 
+ self.assertNotEqual(bench.init_global_step, 0) + ckpt = tf.train.get_checkpoint_state(params.train_dir) + match = re.match(os.path.join(params.train_dir, r'model.ckpt-(\d+).index'), + ckpt.model_checkpoint_path + '.index') + self.assertTrue(match) + self.assertGreaterEqual(int(match.group(1)), params.num_batches) + # Check that the batch norm moving averages are restored from checkpoints + with tf.Graph().as_default(): + bench = benchmark_cnn.BenchmarkCNN(params) + bench._build_model() + saver = tf.train.Saver(bench.variable_mgr.savable_variables()) + with tf.Session(config=benchmark_cnn.create_config_proto(params)) as sess: + benchmark_cnn.load_checkpoint(saver, sess, params.train_dir) + sess.run(bench.variable_mgr.get_post_init_ops()) + bn_moving_vars = [ + v for v in tf.global_variables() + if '/batchnorm' in v.name and '/moving' in v.name + ] + self.assertGreater(len(bn_moving_vars), 0) + for moving_var in bn_moving_vars: + moving_var_value = sess.run(moving_var) + # Check that the moving means and moving variances have been restored + # by asserting they are not their default values of 0 and 1, + # respectively + if '/moving_mean' in moving_var.name: + self.assertFalse(np.array_equal(moving_var_value, + np.zeros(moving_var_value.shape, + moving_var_value.dtype))) + else: + self.assertIn('/moving_variance', moving_var.name) + self.assertFalse(np.array_equal(moving_var_value, + np.ones(moving_var_value.shape, + moving_var_value.dtype))) + + def testModel(self): + _check_has_gpu() + if not self.get_model_name() or not self.model_execution_test(): + return + + params = benchmark_cnn.make_params( + model=self.get_model_name(), + num_batches=1, + num_intra_threads=1, + num_inter_threads=12, + batch_size=2, + distortions=False) + + # Run this one; note that this uses a non-test session. 
+ bench = benchmark_cnn.BenchmarkCNN(params) + bench.run() + + def testSendRecvVariables(self): + self._testVariables('parameter_server') + if self.extended_tests(): + self._testVariables('parameter_server', local_parameter_device='CPU') + self._testVariables('parameter_server', optimizer='sgd') + + def testReplicatedVariables(self): + self._testVariables('replicated') + if self.extended_tests(): + self._testVariables('replicated', all_reduce_spec=None) + self._testVariables('replicated', use_fp16=True, fp16_vars=False) + self._testVariables( + 'replicated', + all_reduce_spec=None, + use_fp16=True, + fp16_vars=False, + fp16_enable_auto_loss_scale=True, + fp16_inc_loss_scale_every_n=4) + + def testIndependentVariables(self): + self._testVariables('independent') + self._testVariables( + 'independent', + all_reduce_spec=None, + use_fp16=True, + fp16_vars=False, + fp16_enable_auto_loss_scale=True, + fp16_inc_loss_scale_every_n=4) + + def testSummaryVerbosity(self): + self._testVariables('parameter_server', summary_verbosity=1) + if self.extended_tests(): + self._testVariables('parameter_server', summary_verbosity=2) + self._testVariables('parameter_server', summary_verbosity=3) + + def testStagedVariables(self): + self._testVariables('parameter_server', staged_vars=True) + if self.extended_tests(): + self._testVariables('parameter_server', staged_vars=True, + local_parameter_device='CPU') + self._testVariables('parameter_server', staged_vars=True, use_fp16=True, + fp16_vars=True) + + def _assert_correct_var_type(self, var, params): + if 'gpu_cached_inputs' not in var.name: + if params.use_fp16 and params.fp16_vars and 'batchnorm' not in var.name: + expected_type = tf.float16 + else: + expected_type = tf.float32 + self.assertEqual(var.dtype.base_dtype, expected_type) + + def _testVariables(self, + variable_update, + summary_verbosity=0, + local_parameter_device='GPU', + staged_vars=False, + optimizer='momentum', + # TODO(b/80125832): Enable nccl in tests + # all_reduce_spec='nccl', + all_reduce_spec='', + use_fp16=False, + fp16_vars=False, + fp16_enable_auto_loss_scale=False, + fp16_inc_loss_scale_every_n=10): + if not self.get_model_name(): + return + _check_has_gpu() + + params = benchmark_cnn.make_params( + model=self.get_model_name(), + num_batches=1, + num_intra_threads=1, + num_inter_threads=12, + distortions=False, + variable_update=variable_update, + local_parameter_device=local_parameter_device, + num_gpus=2, + summary_verbosity=summary_verbosity, + staged_vars=staged_vars, + optimizer=optimizer, + all_reduce_spec=all_reduce_spec, + compact_gradient_transfer=False if all_reduce_spec == 'nccl' else True, + use_fp16=use_fp16, + fp16_loss_scale=2., + fp16_vars=fp16_vars, + fp16_enable_auto_loss_scale=fp16_enable_auto_loss_scale, + fp16_inc_loss_scale_every_n=fp16_inc_loss_scale_every_n, + ) + + # Test building models using multiple GPUs, but don't + # run them. + with self.test_session(graph=tf.Graph()): + bench = benchmark_cnn.BenchmarkCNN(params) + bench._build_model() + + # Rough validation of variable type and placement, depending on mode. 
+ all_vars = tf.global_variables() + tf.local_variables() + if params.variable_update == 'parameter_server': + for v in all_vars: + tf.logging.debug('var: %s' % v.name) + match = re.match(r'tower_(\d+)/v/gpu_cached_inputs:0', v.name) + if match: + self.assertEqual(v.device, '/device:GPU:%s' % match.group(1)) + elif v.name.startswith('v/'): + self.assertEqual(v.device, '/device:%s:0' % local_parameter_device) + self._assert_correct_var_type(v, params) + elif v.name in ('input_processing/images:0', + 'input_processing/labels:0', 'init_learning_rate:0', + 'global_step:0', 'loss_scale:0', + 'loss_scale_normal_steps:0'): + self.assertEqual(v.device, '/device:CPU:0') + else: + raise ValueError('Unexpected variable %s' % v.name) + else: + v0_count = 0 + v1_count = 0 + for v in all_vars: + if v.name.startswith('tower_0/v0/'): + self.assertEqual(v.name, 'tower_0/v0/gpu_cached_inputs:0') + self.assertEqual(v.device, '/device:GPU:0') + elif v.name.startswith('tower_1/v1/'): + self.assertEqual(v.name, 'tower_1/v1/gpu_cached_inputs:0') + self.assertEqual(v.device, '/device:GPU:1') + elif v.name.startswith('v0/'): + v0_count += 1 + self.assertEqual(v.device, '/device:GPU:0') + self._assert_correct_var_type(v, params) + elif v.name.startswith('v1/'): + v1_count += 1 + self.assertEqual(v.device, '/device:GPU:1') + self._assert_correct_var_type(v, params) + elif v.name in ('input_processing/images:0', + 'input_processing/labels:0', 'init_learning_rate:0', + 'global_step:0', 'loss_scale:0', + 'loss_scale_normal_steps:0'): + self.assertEqual(v.device, '/device:CPU:0') + else: + raise ValueError('Unexpected variable %s' % v.name) + self.assertEqual(v0_count, v1_count) + + # Validate summary ops in the model depending on verbosity level + summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES) + num_summary_ops = len(summary_ops) + self.assertEqual(num_summary_ops > 0, summary_verbosity > 0) + if summary_verbosity > 0: + has_affine_histogram = False + has_gradient_histogram = False + has_log_gradients_histogram = False + for op in summary_ops: + if '/gradients' in op.name: + has_gradient_histogram = True + elif '/affine' in op.name: + has_affine_histogram = True + elif 'log_gradients' in op.name: + has_log_gradients_histogram = True + self.assertEqual(summary_verbosity >= 3, has_affine_histogram) + self.assertEqual(summary_verbosity >= 3, has_gradient_histogram) + self.assertEqual(summary_verbosity >= 2, has_log_gradients_histogram) + if summary_verbosity == 1: + self.assertLess(num_summary_ops, 10) + + +class TrivialModelTest(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'trivial' + + +class TestVgg1Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'vgg11' + + +class TestVgg19Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'vgg19' + + +class TestLenet5Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'lenet' + + +class TestGooglenetModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'googlenet' + + +class TestOverfeatModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'overfeat' + + +class TestAlexnetModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'alexnet' + + def extended_tests(self): + return True + + +class TestTrivialModel(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'trivial' + + +class TestInceptionv3Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'inception3' + + def extended_tests(self): + return True + + 
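The per-model test classes above and below all follow the same minimal pattern: each subclass of `TfCnnBenchmarksModelTest` only overrides `get_model_name()`, plus the optional `extended_tests()` / `model_save_load_test()` / `model_execution_test()` hooks that gate the more expensive checks. As a hedged illustration (not part of this patch), covering one more model would look like the sketch below; the model name `mobilenet` is an assumption, not something the patch adds.

```python
# Illustrative sketch only, not part of the patch. The base class skips every
# test when get_model_name() returns None, so only concrete subclasses run.
class TestMobilenetModel(TfCnnBenchmarksModelTest):  # 'mobilenet' name assumed

  def get_model_name(self):
    return 'mobilenet'

  def extended_tests(self):
    # Opt this (cheap) model into the extra _testVariables variants.
    return True
```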
+class TestInceptionv4Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'inception4' + + +class TestResnet50Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet50' + + def model_save_load_test(self): + return True + + +class TestResnet101Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet101' + + +class TestResnet152Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet152' + + +class TestResnet50V2Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet50_v2' + + +class TestResnet101V2Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet101_v2' + + +class TestResnet152V2Model(TfCnnBenchmarksModelTest): + + def get_model_name(self): + return 'resnet152_v2' + + +class TfCnnBenchmarksTest(tf.test.TestCase): + """Tests that benchmark_cnn runs correctly.""" + + def setUp(self): + super(TfCnnBenchmarksTest, self).setUp() + _check_has_gpu() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + def _run_benchmark_cnn(self, params): + logs = [] + benchmark_cnn.log_fn = test_util.print_and_add_to_list(logs) + benchmark_cnn.BenchmarkCNN(params).run() + return logs + + def _run_benchmark_cnn_with_fake_images(self, params, images, labels): + logs = [] + benchmark_cnn.log_fn = test_util.print_and_add_to_list(logs) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.input_preprocessor = preprocessing.TestImagePreprocessor( + params.batch_size * params.num_gpus, + [[params.batch_size, 227, 227, 3], [params.batch_size]], + params.num_gpus, + bench.model.data_type) + bench.dataset._queue_runner_required = True + bench.input_preprocessor.set_fake_data(images, labels) + bench.input_preprocessor.expected_subset = ('validation' + if params.eval else 'train') + bench.run() + return logs + + def _run_benchmark_cnn_with_black_and_white_images(self, params): + """Runs BenchmarkCNN with black and white images. + + A BenchmarkCNN is created and run with black and white images as input. Half + the images are black (i.e., filled with 0s) and half are white (i.e., filled + with 255s). + + Args: + params: Params for BenchmarkCNN. + + Returns: + A list of lines from the output of BenchmarkCNN. + """ + # TODO(reedwm): Instead of generating images here, use black and white + # tfrecords by calling test_util.create_black_and_white_images(). + effective_batch_size = params.batch_size * params.num_gpus + half_batch_size = effective_batch_size // 2 + images = np.zeros((effective_batch_size, 227, 227, 3), dtype=np.float32) + images[half_batch_size:, :, :, :] = 255 + labels = np.array([0] * half_batch_size + [1] * half_batch_size, + dtype=np.int32) + return self._run_benchmark_cnn_with_fake_images(params, images, labels) + + def _train_and_eval_local(self, + params, + check_output_values=False, + max_final_loss=10., + skip=None, + use_test_preprocessor=True): + # TODO(reedwm): check_output_values should default to True and be enabled + # on every test. Currently, if check_output_values=True and the calls to + # tf.set_random_seed(...) and np.seed(...) are passed certain seed values in + # benchmark_cnn.py, then most tests will fail. This indicates the tests + # are brittle and could fail with small changes when + # check_output_values=True, so check_output_values defaults to False for + # now. 
+ + def run_fn(run_type, inner_params): + del run_type + if use_test_preprocessor: + return [ + self._run_benchmark_cnn_with_black_and_white_images(inner_params) + ] + else: + return [self._run_benchmark_cnn(inner_params)] + + return test_util.train_and_eval(self, run_fn, params, + check_output_values=check_output_values, + max_final_loss=max_final_loss, + skip=skip) + + def testAlexnet(self): + params = test_util.get_params('testAlexnet')._replace( + num_batches=30, init_learning_rate=0.01, model='alexnet') + self._train_and_eval_local(params) + + def testNoPrintAccuracy(self): + params = test_util.get_params('testNoPrintAccuracy')._replace( + print_training_accuracy=False) + self._train_and_eval_local(params) + + def testLowAccuracy(self): + params = test_util.get_params('testLowAccuracy')._replace( + print_training_accuracy=True, batch_size=5, num_batches=10) + # We force low accuracy by having each batch containing 10 identical images, + # each with a different label. This guarantees a top-1 accuracy of exactly + # 0.1 and a top-5 accuracy of exactly 0.5. + images = np.zeros((10, 227, 227, 3), dtype=np.float32) + labels = np.arange(10, dtype=np.int32) + logs = self._run_benchmark_cnn_with_fake_images(params, images, labels) + training_outputs = test_util.get_training_outputs_from_logs( + logs, params.print_training_accuracy) + last_output = training_outputs[-1] + # TODO(reedwm): These should be assertEqual but for some reason, + # occasionally the accuracies are lower (Running this test 500 times, these + # asserts failed twice). Investigate this problem. + self.assertLessEqual(last_output.top_1_accuracy, 0.1) + self.assertLessEqual(last_output.top_5_accuracy, 0.5) + + def testParameterServer(self): + params = test_util.get_params('testParameterServer') + self._train_and_eval_local(params) + + def testParameterServerStaged(self): + params = test_util.get_params('testParameterServerStaged')._replace( + staged_vars=True) + self._train_and_eval_local(params) + + def testReplicated(self): + params = test_util.get_params('testReplicated')._replace( + variable_update='replicated') + self._train_and_eval_local(params) + + def testIndependent(self): + params = test_util.get_params('testIndependent')._replace( + variable_update='independent') + self._train_and_eval_local(params) + + def testForwardOnly(self): + params = test_util.get_params('testForwardOnly')._replace(forward_only=True) + # Evaluation is not supported with --forward_only, so we set skip='eval'. + self._train_and_eval_local(params, skip='eval') + + def testForwardOnlyAndFreeze(self): + params = test_util.get_params('testForwardOnlyAndFreeze')._replace( + forward_only=True, freeze_when_forward_only=True, train_dir=None) + # Training is not supported with --freeze_when_forward_only. 
+ self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + + def testNoDistortions(self): + params = test_util.get_params('testNoDistortions')._replace( + distortions=False) + self._train_and_eval_local(params) + + def testCpuAsLocalParamDevice(self): + params = test_util.get_params('testCpuAsLocalParamDevice')._replace( + local_parameter_device='cpu') + self._train_and_eval_local(params) + + def testNHWC(self): + params = test_util.get_params('testNHWC')._replace(data_format='NHWC') + self._train_and_eval_local(params) + + def testCpuAsDevice(self): + params = test_util.get_params('testCpuAsDevice')._replace( + device='cpu', data_format='NHWC') # NHWC required when --device=cpu + self._train_and_eval_local(params) + + def testMomentumParameterServer(self): + params = test_util.get_params('testMomentumParameterServer')._replace( + optimizer='momentum', momentum=0.8) + self._train_and_eval_local(params) + + def testRmspropReplicated(self): + params = test_util.get_params('testRmspropReplicated')._replace( + variable_update='replicated', + optimizer='rmsprop', + rmsprop_decay=0.8, + rmsprop_momentum=0.6, + rmsprop_epsilon=0.7, + init_learning_rate=0.01) + self._train_and_eval_local(params) + + def testBatchGroupSize(self): + params = test_util.get_params('testBatchGroupSize')._replace( + batch_group_size=4, num_batches=100, num_warmup_batches=5) + self._train_and_eval_local(params) + + def testGradientClip(self): + params = test_util.get_params('testGradientClip')._replace( + gradient_clip=100.0) + self._train_and_eval_local(params) + + def testWeightDecay(self): + params = test_util.get_params('testWeightDecay')._replace( + weight_decay=0.0001) + self._train_and_eval_local(params) + + def testNoLayers(self): + params = test_util.get_params('testNoLayers')._replace(use_tf_layers=False) + self._train_and_eval_local(params) + + def testSaveModelSteps(self): + params = test_util.get_params('testSaveModelSteps')._replace( + save_model_steps=2, num_warmup_batches=0, num_batches=10, + max_ckpts_to_keep=3) + self._train_and_eval_local(params) + for i in range(1, 20 + 1): + # We train for 20 steps, since self._train_and_eval_local() does two + # training runs of 10 steps each. We save a checkpoint every 2 steps and + # keep the last 3 checkpoints, so at the end, we should have checkpoints + # for steps 16, 18, and 20. + matches = glob.glob(os.path.join(params.train_dir, + 'model.ckpt-{}.*'.format(i))) + if i in (16, 18, 20): + self.assertTrue(matches) + else: + self.assertFalse(matches) + + def testFp16WithFp32Vars(self): + params = test_util.get_params('testFp16WithFp32Vars')._replace( + use_fp16=True, fp16_vars=False, fp16_loss_scale=1.) 
+ self._train_and_eval_local(params) + + def testFp16WithFp16Vars(self): + params = test_util.get_params('testFp16WithFp16Vars')._replace( + use_fp16=True, fp16_vars=True) + self._train_and_eval_local(params) + + def testXlaCompile(self): + params = test_util.get_params('testXlaCompile')._replace(xla_compile=True) + self._train_and_eval_local(params) + + @unittest.skip('Fails for unknown reason') + def testXlaCompileWithFp16(self): + params = test_util.get_params('testXlaCompileWithFp16')._replace( + use_fp16=True, xla_compile=True) + self._train_and_eval_local(params) + + def testGradientRepacking(self): + params = test_util.get_params('testGradientRepacking1')._replace( + gradient_repacking=2) + self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + params = test_util.get_params('testGradientRepacking2')._replace( + gradient_repacking=2, use_fp16=True) + self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + + def testTraceFileChromeTraceFormat(self): + trace_file = os.path.join(self.get_temp_dir(), + 'testTraceFileChromeTraceFormat_tracefile') + params = test_util.get_params('testTraceFileChromeTraceFormat')._replace( + trace_file=trace_file, use_chrome_trace_format=True) + self._train_and_eval_local(params) + self.assertGreater(os.stat(trace_file).st_size, 0) + + def testTraceFileStepStatsProto(self): + trace_file = os.path.join(self.get_temp_dir(), + 'testTraceFileStepStatsProto_tracefile') + params = test_util.get_params('testTraceFileStepStatsProto')._replace( + trace_file=trace_file, use_chrome_trace_format=False) + self._train_and_eval_local(params) + self.assertGreater(os.stat(trace_file).st_size, 0) + with open(trace_file) as f: + step_stats = step_stats_pb2.StepStats() + # The following statement should not raise an exception. + contents = f.read() + text_format.Merge(contents, step_stats) + + def testTfprofFile(self): + tfprof_file = os.path.join(self.get_temp_dir(), 'testTfprofFile_tfproffile') + params = test_util.get_params('testTfprofFile')._replace( + tfprof_file=tfprof_file) + self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint') + self.assertGreater(os.stat(tfprof_file).st_size, 0) + with open(tfprof_file, 'rb') as f: + profile_proto = tfprof_log_pb2.ProfileProto() + # The following statement should not raise an exception. 
+ profile_proto.ParseFromString(f.read()) + + @unittest.skip('Fails for unknown reason') + def testMoveTrainDir(self): + params = test_util.get_params('testMoveTrainDir') + self._train_and_eval_local(params) + new_train_dir = params.train_dir + '_moved' + os.rename(params.train_dir, new_train_dir) + params = params._replace(train_dir=new_train_dir, eval=True) + self._run_benchmark_cnn_with_black_and_white_images(params) + + @mock.patch('tensorflow.compat.v1.train.Saver') + @mock.patch('benchmark_cnn._get_checkpoint_to_load') + def testLoadCheckpoint(self, mock_checkpoint_to_load, mock_saver): + """Tests load checkpoint with full path to checkpoint.""" + expected_checkpoint = '/path/to/checkpoints/model.ckpt-1243' + mock_checkpoint_to_load.return_value = expected_checkpoint + + global_batch = benchmark_cnn.load_checkpoint(mock_saver, + None, + expected_checkpoint) + self.assertEqual(global_batch, 1243) + + def testGetCheckpointToLoadFullPath(self): + """Tests passing full path.""" + ckpt_path = '/foo/bar/model.ckpt-189' + full_path = benchmark_cnn._get_checkpoint_to_load(ckpt_path) + self.assertEqual(full_path, ckpt_path) + + def testGetCheckpointToLoadException(self): + """Tests exception for directory without a checkpoint.""" + ckpt_path = '/foo/bar/checkpoints' + self.assertRaises(benchmark_cnn.CheckpointNotFoundException, + benchmark_cnn._get_checkpoint_to_load, ckpt_path) + + @mock.patch('tensorflow.compat.v1.train.get_checkpoint_state') + def testGetCheckpointToLoad(self, mock_checkpoint_state): + """Tests passing path to checkpoint folder.""" + expected_checkpoint = '/path/to/checkpoints/model.ckpt-1243' + mock_checkpoint_state.return_value = mock.Mock( + model_checkpoint_path=expected_checkpoint) + ckpt_path = '/path/to/checkpoints/' + full_path = benchmark_cnn._get_checkpoint_to_load(ckpt_path) + self.assertEqual(full_path, expected_checkpoint) + + def testImagenetPreprocessor(self): + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params('testImagenetPreprocessor')._replace( + data_dir=imagenet_dir, data_name='imagenet') + self._train_and_eval_local(params, use_test_preprocessor=False) + + def testImagenetPreprocessorNoDistortions(self): + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params( + 'testImagenetPreprocessorNoDistortions')._replace( + data_dir=imagenet_dir, data_name='imagenet', distortions=False) + self._train_and_eval_local(params, use_test_preprocessor=False) + + def testImagenetPreprocessorVerboseSummary(self): + imagenet_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + params = test_util.get_params( + 'testImagenetPreprocessorVerboseSummary')._replace( + data_dir=imagenet_dir, data_name='imagenet', distortions=False, + summary_verbosity=2) + self._train_and_eval_local(params, use_test_preprocessor=False) + + def testCifar10SyntheticData(self): + params = test_util.get_params('testCifar10SyntheticData')._replace( + data_name='cifar10') + self._train_and_eval_local(params) + + def testShiftRatio(self): + test_util.monkey_patch_base_cluster_manager() + params = benchmark_cnn.make_params( + data_name='imagenet', + data_dir=os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data'), + job_name='worker', + worker_hosts='w1,w2,w3,w4', + ps_hosts='p1', + task_index=0) + self.assertEqual( + benchmark_cnn.BenchmarkCNN(params).input_preprocessor.shift_ratio, 0.0) + params = 
params._replace(task_index=3) + self.assertEqual( + benchmark_cnn.BenchmarkCNN(params).input_preprocessor.shift_ratio, 0.75) + + def testDistributedReplicatedSavableVars(self): + test_util.monkey_patch_base_cluster_manager() + params = benchmark_cnn.make_params( + variable_update='distributed_replicated', + model='inception4', + data_name='imagenet', + data_dir=os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data'), + job_name='worker', + worker_hosts='w1,w2,w3,w4', + ps_hosts='p1', + datasets_use_prefetch=False) + + bench = benchmark_cnn.BenchmarkCNN(params) + with tf.Graph().as_default(): + bench._build_model() + savable_vars = bench.variable_mgr.savable_variables() + # Assert all global variables are in savable_vars + for v in tf.global_variables(): + if not v.name.startswith( + variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0'): + self.assertEqual(v.name, 'global_step:0') + name = bench.variable_mgr._strip_port(v.name) + if name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX): + name = name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):] + self.assertIn(name, savable_vars) + self.assertIn(savable_vars[name], tf.global_variables()) + # Assert all local variables on the first tower are in savable_vars + for v in tf.local_variables(): + if v.name.startswith('v0/'): + name = bench.variable_mgr._strip_port(v.name) + self.assertIn(name, savable_vars) + + def _test_preprocessing_eval(self, image_height, image_width, output_height, + output_width): + image = tf.fill((image_height, image_width, 3), + tf.constant(128, dtype=tf.uint8)) + params = benchmark_cnn.make_params() + new_image = preprocessing.eval_image(image, output_height, output_width, 0, + 'bilinear', params.summary_verbosity) + with self.test_session() as sess: + new_image_value = sess.run(new_image) + self.assertAllEqual(new_image_value, + np.full((output_height, output_width, 3), 128, + dtype=np.uint8)) + + def testPreprocessingEval(self): + self._test_preprocessing_eval(10, 10, 4, 4) + self._test_preprocessing_eval(4, 4, 10, 10) + self._test_preprocessing_eval(1, 100, 100, 1) + self._test_preprocessing_eval(100, 1, 1, 100) + self._test_preprocessing_eval(1, 100, 1, 100) + + def _test_preprocessing_traing(self, image_buf, image_color, + output_height, output_width, bbox, + batch_position, resize_method, distortions, + summary_verbosity, fuse_decode_and_crop): + new_image = preprocessing.train_image( + image_buf, + output_height, + output_width, + bbox, + batch_position, + resize_method, + distortions, + summary_verbosity=summary_verbosity, + fuse_decode_and_crop=fuse_decode_and_crop) + self.assertEqual(new_image.shape, [output_height, output_width, 3]) + with self.test_session(use_gpu=True) as sess: + new_image_value = sess.run(new_image) + self.assertAllClose( + new_image_value, + np.full( + [output_height, output_width, 3], + image_color, + dtype=np.float32), + atol=50., + rtol=0.) 
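`testShiftRatio` above expects a `shift_ratio` of 0.0 for task 0 and 0.75 for task 3 when four workers are configured, which is consistent with each worker starting its read of the input data at an offset of `task_index / num_workers`. The following standalone sketch is an inference from those assertions, not code taken from the patch.

```python
# Assumption inferred from the assertions in testShiftRatio: each worker
# shifts its input pipeline by task_index / num_workers so that the workers
# read staggered slices of the dataset.
def expected_shift_ratio(task_index, num_workers):
    return task_index / float(num_workers)

assert expected_shift_ratio(0, 4) == 0.0   # first of four workers
assert expected_shift_ratio(3, 4) == 0.75  # fourth worker, as asserted above
```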
+ + def testPreprocessingTrain(self): + test_data_dir = os.path.join(platforms_util.get_test_data_dir(), 'images') + black_file = os.path.join(test_data_dir, 'black_image.jpg') + with open(black_file, 'rb') as f: + black_jpg_buffer = f.read() + white_file = os.path.join(test_data_dir, 'white_image.jpg') + with open(white_file, 'rb') as f: + white_jpg_buffer = f.read() + bbox = tf.zeros((1, 0, 4), dtype=tf.float32) + batch_position = 0 + # Each size config is (output_height, output_width, resize_method) + size_configs = [(100, 100, 'round_robin'), (150, 10, 'bilinear'), + (10, 150, 'nearest')] + # Each image config is (image_buf, image_color) + image_configs = [(white_jpg_buffer, 255), (black_jpg_buffer, 0)] + for (image_buf, image_color) in image_configs: + for output_height, output_width, resize_method in size_configs: + for distortions in [True, False]: + for summary_verbosity in [0, 2]: + for fuse_decode_and_crop in [True, False]: + self._test_preprocessing_traing( + image_buf, image_color, output_height, output_width, bbox, + batch_position, resize_method, distortions, summary_verbosity, + fuse_decode_and_crop) + + def _test_learning_rate(self, params, global_step_to_expected_learning_rate): + self.longMessage = True # pylint: disable=invalid-name + bench = benchmark_cnn.BenchmarkCNN(params) + with tf.Graph().as_default() as graph: + bench._build_model() + global_step = graph.get_tensor_by_name('global_step:0') + learning_rate = graph.get_tensor_by_name('learning_rate_tensor:0') + with self.test_session(graph=graph, use_gpu=True) as sess: + items = global_step_to_expected_learning_rate.items() + for global_step_val, expected_learning_rate in items: + self.assertAlmostEqual(sess.run(learning_rate, + {global_step: global_step_val}), + expected_learning_rate, + msg='at global_step:{}'. + format(global_step_val)) + + def testLearningRateModelSpecificResNet(self): + params = benchmark_cnn.make_params(model='resnet50', + batch_size=256, + variable_update='parameter_server', + num_gpus=1) + self._test_learning_rate(params, { + 0: 0, + 150136: 0.128, + 150137: 0.0128, + 300273: 0.0128, + 300274: 0.00128, + 10000000: 0.0000128 + }) + + def testLearningRateUserProvidedInitLr(self): + params = benchmark_cnn.make_params(model='resnet50', + batch_size=256, + variable_update='replicated', + init_learning_rate=1.) + self._test_learning_rate(params, { + 0: 1., + 10000000: 1. + }) + + def testLearningRateUserProvidedInitLrAndWarmup(self): + params = benchmark_cnn.make_params(model='resnet50', + batch_size=256, + variable_update='replicated', + init_learning_rate=1., + num_learning_rate_warmup_epochs=5) + self._test_learning_rate(params, { + 0: 0., + 12511: 0.5, + 25022: 1., + 10000000: 1. + }) + + def testLearningRateUserProvidedDecayInfo(self): + params = benchmark_cnn.make_params(model='resnet50', + init_learning_rate=1., + learning_rate_decay_factor=0.5, + num_epochs_per_decay=2, + minimum_learning_rate=0.3750, + batch_size=32) + self._test_learning_rate(params, { + 0: 1., + 80071: 1., + 80072: 0.5, + 160143: 0.5, + 160144: 0.375, + 10000000: 0.375 + }) + + def testLearningRateUserProvidedZeroDecay(self): + params = benchmark_cnn.make_params(model='resnet50', + num_learning_rate_warmup_epochs=0, + learning_rate_decay_factor=0.5, + num_epochs_per_decay=0, + minimum_learning_rate=0.3750, + batch_size=32) + with self.assertRaises(ValueError): + with tf.Graph().as_default(): + # This will fail because params.learning_rate_decay_factor cannot be + # nonzero if params.num_epochs_per_decay is zero. 
+ benchmark_cnn.BenchmarkCNN(params)._build_model() + + def testLearningRateUserProvidedSchedule(self): + params = benchmark_cnn.make_params( + model='trivial', + batch_size=32, + piecewise_learning_rate_schedule='1;3;.1;5;.01') + self._test_learning_rate(params, { + 0: 1., + 120108: 1., + 120109: 0.1, + 200181: 0.1, + 200182: 0.01, + 100000000: 0.01 + }) + + def testNumBatchesAndEpochs(self): + params = benchmark_cnn.make_params() + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 10, 100) + self.assertEqual(batches, benchmark_cnn._DEFAULT_NUM_BATCHES) + self.assertAlmostEqual(epochs, + float(benchmark_cnn._DEFAULT_NUM_BATCHES) / 10) + + params = benchmark_cnn.make_params(num_batches=21) + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 25, 50) + self.assertEqual(batches, 21) + self.assertAlmostEqual(epochs, 10.5) + + params = benchmark_cnn.make_params(num_epochs=3) + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3) + self.assertEqual(batches, 5) + self.assertAlmostEqual(epochs, 10./3.) + + params = benchmark_cnn.make_params(num_epochs=4) + batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3) + self.assertEqual(batches, 6) + self.assertAlmostEqual(epochs, 4) + + with self.assertRaises(ValueError): + params = benchmark_cnn.make_params(num_batches=100, num_epochs=100) + benchmark_cnn.get_num_batches_and_epochs(params, 1, 1) + + def _testEvalDuringTraining(self, params, expected_num_eval_batches_found): + # The idea of this test is that all train images are black and all eval + # images are white. We pass the images through the TestModel, and ensure + # the outputs are as expected. + + batch_size = params.batch_size + eval_batch_size = params.eval_batch_size or params.batch_size + + class TestModel(test_util.TestCNNModel): + + def __init__(self): + super(TestModel, self).__init__() + self.depth = 3 + + def add_inference(self, cnn): + if cnn.phase_train: + # This will allow us to test that 100 is only added during training + # and not during eval. + cnn.top_layer += 100 + assert cnn.top_layer.shape[0] == batch_size + else: + assert cnn.top_layer.shape[0] == eval_batch_size + + # Reduce the image to a single number. The number should be (-1 + 100) + # during training and 1 during testing. + cnn.top_layer = tf.reshape(cnn.top_layer, (cnn.top_layer.shape[0], -1)) + cnn.top_layer = tf.reduce_mean(cnn.top_layer, axis=1) + cnn.top_layer = tf.reshape(cnn.top_layer, + (cnn.top_layer.shape[0], 1, 1, 1)) + cnn.top_size = 1 + trainable_vars = tf.trainable_variables() + + # The super method will compute image*A*B, where A=1 and B=2. + super(TestModel, self).add_inference(cnn) + + if not cnn.phase_train: + # Assert no new variables were added, since they should be reused from + # training. 
+ assert len(trainable_vars) == len(tf.trainable_variables()) + + model = TestModel() + dataset = datasets.ImagenetDataset(params.data_dir) + logs = [] + bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=model, dataset=dataset) + with test_util.monkey_patch(benchmark_cnn, + log_fn=test_util.print_and_add_to_list(logs)): + bench_cnn.run() + training_outputs = test_util.get_training_outputs_from_logs( + logs, print_training_accuracy=False) + self.assertEqual(len(training_outputs), params.num_batches) + expected_training_output = (-1 + 100) * 1 * 2 + for training_output in training_outputs: + self.assertEqual(training_output.loss, expected_training_output) + eval_outputs = test_util.get_evaluation_outputs_from_logs(logs) + self.assertTrue(eval_outputs) + expected_eval_output = 1 * 1 * 2 + for eval_output in eval_outputs: + self.assertEqual(eval_output.top_1_accuracy, expected_eval_output) + self.assertEqual(eval_output.top_5_accuracy, expected_eval_output) + + num_eval_batches_found = 0 + eval_batch_regex = re.compile(r'^\d+\t[0-9.]+ examples/sec$') + for log in logs: + if eval_batch_regex.match(log): + num_eval_batches_found += 1 + self.assertEqual(num_eval_batches_found, expected_num_eval_batches_found) + + def testEvalDuringTraining(self): + data_dir = test_util.create_black_and_white_images() + base_params = test_util.get_params('testEvalDuringTraining') + train_dir = base_params.train_dir + base_params = base_params._replace( + train_dir=None, print_training_accuracy=False, num_warmup_batches=0, + num_batches=7, num_eval_batches=2, display_every=1, + init_learning_rate=0, weight_decay=0, + distortions=False, data_dir=data_dir) + expected_num_eval_batches_found = ( + base_params.num_eval_batches * (base_params.num_batches // 2 + 1)) + + # Test --eval_during_training_every_n_steps + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='parameter_server'), + expected_num_eval_batches_found) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='replicated'), + expected_num_eval_batches_found) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='replicated', + summary_verbosity=2, + save_summaries_steps=2, + datasets_use_prefetch=False), + expected_num_eval_batches_found) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_steps=2, + variable_update='replicated', + use_fp16=True, train_dir=train_dir, + eval_batch_size=base_params.batch_size + 2), + expected_num_eval_batches_found) + + # Test --eval_during_training_every_n_epochs + every_n_epochs = (2 * base_params.batch_size * base_params.num_gpus / + datasets.IMAGENET_NUM_TRAIN_IMAGES) + self._testEvalDuringTraining( + base_params._replace(eval_during_training_every_n_epochs=every_n_epochs, + variable_update='replicated'), + expected_num_eval_batches_found) + + # Test --eval_during_training_at_specified_steps + list_steps = [2, 3, 5, 7, 1000] + num_eval_steps = 1 + sum(1 for step in list_steps + if step < base_params.num_batches) + expected_num_eval_batches_found = ( + base_params.num_eval_batches * num_eval_steps) + + self._testEvalDuringTraining( + base_params._replace(eval_during_training_at_specified_steps=list_steps, + variable_update='replicated'), + expected_num_eval_batches_found) + + # Test --eval_during_training_at_specified_epochs + list_epochs = [(step * base_params.batch_size * base_params.num_gpus / + 
datasets.IMAGENET_NUM_TRAIN_IMAGES) + for step in list_steps] + self._testEvalDuringTraining( + base_params._replace( + eval_during_training_at_specified_epochs=list_epochs, + variable_update='replicated'), + expected_num_eval_batches_found) + + # Test --eval_during_training_every_n_steps runs with synthetic data. + params = base_params._replace( + variable_update='replicated', data_dir=None, + eval_during_training_every_n_steps=2, num_batches=2) + benchmark_cnn.BenchmarkCNN(params).run() + + def testEvalDuringTrainingNumEpochs(self): + params = benchmark_cnn.make_params( + batch_size=1, eval_batch_size=2, eval_during_training_every_n_steps=1, + num_batches=30, num_eval_epochs=100 / datasets.IMAGENET_NUM_VAL_IMAGES) + bench_cnn = benchmark_cnn.BenchmarkCNN(params) + self.assertEqual(bench_cnn.num_batches, 30) + self.assertAlmostEqual(bench_cnn.num_epochs, + 30 / datasets.IMAGENET_NUM_TRAIN_IMAGES) + self.assertAlmostEqual(bench_cnn.num_eval_batches, 50) + self.assertAlmostEqual(bench_cnn.num_eval_epochs, + 100 / datasets.IMAGENET_NUM_VAL_IMAGES) + + def testEarlyStopping(self): + params = benchmark_cnn.make_params( + batch_size=2, + display_every=1, + num_batches=100, + eval_during_training_every_n_steps=2, + stop_at_top_1_accuracy=0.4, + ) + with mock.patch.object(benchmark_cnn.BenchmarkCNN, '_eval_once', + side_effect=[(0.1, 0.1), (0.5, 0.5), (0.2, 0.2)] + ) as mock_eval_once: + logs = [] + bench_cnn = benchmark_cnn.BenchmarkCNN(params) + with test_util.monkey_patch(benchmark_cnn, + log_fn=test_util.print_and_add_to_list(logs)): + bench_cnn.run() + training_outputs = test_util.get_training_outputs_from_logs( + logs, print_training_accuracy=False) + # We should stop after the second evaluation, and we evaluate every 2 + # steps. So there should be 2 * 2 = 4 training outputs. + self.assertEqual(len(training_outputs), 4) + self.assertEqual(mock_eval_once.call_count, 2) + + def testOutOfRangeErrorsAreNotIgnored(self): + error_msg = 'Fake OutOfRangeError error message' + with mock.patch.object(benchmark_cnn.BenchmarkCNN, 'benchmark_with_session', + side_effect=tf.errors.OutOfRangeError(None, None, + error_msg)): + with self.assertRaisesRegex(RuntimeError, error_msg): + benchmark_cnn.BenchmarkCNN(benchmark_cnn.make_params()).run() + + def testInvalidFlags(self): + params = benchmark_cnn.make_params(device='cpu', data_format='NCHW') + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True, + variable_update='replicated', + all_reduce_spec='nccl') + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + # Automatic loss scaling is only supported for 'replicated', 'ps', + # and 'independent' variable_updates. + invalid_variable_updates = [ + 'distributed_replicated', 'distributed_all_reduce' + ] + for variable_update in invalid_variable_updates: + params = benchmark_cnn.make_params( + use_fp16=True, + fp16_vars=True, + fp16_enable_auto_loss_scale=True, + variable_update=variable_update) + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + # Automatic loss scaling is not supported for 'nccl'. + params = benchmark_cnn.make_params( + use_fp16=True, + fp16_vars=True, + fp16_enable_auto_loss_scale=True, + all_reduce_spec='nccl') + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + # Automatic loss scaling is not supported for 'staged_vars'. 
+ params = benchmark_cnn.make_params( + use_fp16=True, + fp16_vars=True, + fp16_enable_auto_loss_scale=True, + staged_vars=True) + with self.assertRaises(ValueError): + benchmark_cnn.BenchmarkCNN(params) + + def testMakeParams(self): + default_params = benchmark_cnn.make_params() + self.assertEqual(default_params.model, + flags.param_specs['model'].default_value) + params = benchmark_cnn.make_params(model='foo') + self.assertEqual(params.model, 'foo') + with self.assertRaises(ValueError): + benchmark_cnn.make_params(job_name='foo') + with self.assertRaises(ValueError): + benchmark_cnn.make_params(gpu_memory_frac_for_testing=-1.) + + +class VariableUpdateTest(tf.test.TestCase): + """Tests that variables are updated correctly. + + These tests use a very simple deterministic model. For example, some tests use + the model + + loss = image * A * B + + where image is a 1x1 images (with a single scalar value), and A and B are + scalar variables. Tests will run tf_cnn_benchmarks with such a model, on a + sequence of scalar images, and assert that the losses are the correct value. + Since the losses depend on the variables, this indirectly tests variables are + updated correctly. + """ + + def setUp(self): + super(VariableUpdateTest, self).setUp() + _check_has_gpu() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + def _get_benchmark_cnn_losses(self, inputs, params): + """Returns the losses of BenchmarkCNN on the given inputs and params.""" + logs = [] + model = test_util.TestCNNModel() + with test_util.monkey_patch(benchmark_cnn, + log_fn=test_util.print_and_add_to_list(logs), + LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15): + bench = benchmark_cnn.BenchmarkCNN( + params, dataset=test_util.TestDataSet(), model=model) + # The test model does not use labels when computing loss, so the label + # values do not matter as long as it's the right shape. + labels = np.array([1] * inputs.shape[0]) + bench.input_preprocessor.set_fake_data(inputs, labels) + if bench.eval_input_preprocessor: + bench.eval_input_preprocessor.set_fake_data(inputs, labels) + bench.run() + + outputs = test_util.get_training_outputs_from_logs( + logs, params.print_training_accuracy) + return [x.loss for x in outputs] + + def _test_variable_update(self, params): + """Tests variables are updated correctly when the given params are used. + + A BenchmarkCNN is created with a TestCNNModel, and is run with some scalar + images. The losses are then compared with the losses obtained with + TestCNNModel().manually_compute_losses() + + Args: + params: a Params tuple used to create BenchmarkCNN. + """ + inputs = test_util.get_fake_var_update_inputs() + actual_losses = self._get_benchmark_cnn_losses(inputs, params) + expected_losses, = test_util.TestCNNModel().manually_compute_losses( + inputs, 1, params) + rtol = 3e-2 if params.use_fp16 else 1e-5 + self.assertAllClose(actual_losses[:len(expected_losses)], expected_losses, + rtol=rtol, atol=0.) 
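+
+  # Worked example (hypothetical numbers, for orientation only): with the toy
+  # loss `loss = image * A * B`, suppose A = B = 1, the image value is 2, and
+  # plain SGD with learning rate 0.1 is used. The first loss is 2 * 1 * 1 = 2,
+  # the gradients are dL/dA = image * B = 2 and dL/dB = image * A = 2, so both
+  # variables become 1 - 0.1 * 2 = 0.8 and the next loss on the same image is
+  # 2 * 0.8 * 0.8 = 1.28. manually_compute_losses() in test_util replays this
+  # kind of arithmetic (with its own constants), which is what these tests
+  # compare against.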
+ + def _test_variable_updates(self, params, + var_updates=('parameter_server', 'replicated')): + for var_update in var_updates: + self._test_variable_update(params._replace(variable_update=var_update)) + + def testDefault(self): + params = test_util.get_var_update_params() + self._test_variable_updates(params) + + # For some reason, this test doesn't always pass + + # def testCpuAsDevice(self): + # params = test_util.get_var_update_params()._replace( + # device='cpu', + # data_format='NHWC') # NHWC required when --device=cpu + # self._test_variable_updates(params) + + def testCpuAsLocalParamDevice(self): + params = test_util.get_var_update_params()._replace( + local_parameter_device='cpu') + self._test_variable_updates(params) + + def testFp16(self): + params = test_util.get_var_update_params()._replace(use_fp16=True) + self._test_variable_updates(params) + + def testMomentum(self): + params = test_util.get_var_update_params()._replace(optimizer='momentum') + self._test_variable_updates(params) + + def testRmsprop(self): + params = test_util.get_var_update_params()._replace(optimizer='rmsprop') + self._test_variable_updates(params) + + def testNoLayers(self): + params = test_util.get_var_update_params()._replace(use_tf_layers=False) + self._test_variable_updates(params) + + def testVariousAllReduceSpecs(self): + # We do not test xring, because it requires all Variables to have at least + # two elements. + params = test_util.get_var_update_params()._replace(all_reduce_spec='pscpu') + self._test_variable_updates(params, var_updates=('replicated',)) + params = params._replace(all_reduce_spec='psgpu') + self._test_variable_updates(params, var_updates=('replicated',)) + # TODO(b/80125832): Enable nccl in tests + # params = params._replace(all_reduce_spec='nccl', + # compact_gradient_transfer=False) + # self._test_variable_updates(params, var_updates=('replicated',)) + + def testPrintBaseLoss(self): + params = test_util.get_var_update_params()._replace( + loss_type_to_report='base_loss') + self._test_variable_updates(params) + + def testSingleL2LossOp(self): + params = test_util.get_var_update_params()._replace( + single_l2_loss_op=True) + self._test_variable_updates(params) + + def testResourceVars(self): + params = test_util.get_var_update_params()._replace( + use_resource_vars=True) + self._test_variable_updates(params) + + def testEvalDuringTrainingEveryNSteps(self): + # TODO(reedwm): Test that the eval results are correct. This only tests that + # training results are correct. + params = test_util.get_var_update_params()._replace( + eval_during_training_every_n_steps=1) + self._test_variable_updates(params, var_updates=('replicated',)) + + +class VariableMgrLocalReplicatedTest(tf.test.TestCase): + + def _test_grad_aggregation_with_var_mgr(self, variable_mgr, num_towers, + num_vars, deferred_grads): + tower_devices = ['/gpu:%d' % i for i in range(num_towers)] + tower_grads = [] + expected_sums = [0.] 
* num_vars + for i, tower_device in enumerate(tower_devices): + with tf.device(tower_device): + grad_vars = [] + for j in range(num_vars): + n = num_towers * i + j + grad_vars.append((tf.constant(n, dtype=tf.float32), + tf.Variable(n, dtype=tf.float32))) + expected_sums[j] += n + tower_grads.append(grad_vars) + + _, agg_device_grads = variable_mgr.preprocess_device_grads( + tower_grads) + expected_device_grads = [] + for i in range(num_towers): + expected_grad_vars = [] + for j in range(num_vars): + expected_grad_and_var = [expected_sums[j], num_towers * i + j] + if isinstance(agg_device_grads[i][j], tuple): + # agg_device_grads[i][j] can be a list or tuple. + expected_grad_and_var = tuple(expected_grad_and_var) + expected_grad_vars.append(expected_grad_and_var) + if isinstance(agg_device_grads[i], tuple): + # agg_device_grads[i] can be a list or tuple. + expected_grad_vars = tuple(expected_grad_vars) + expected_device_grads.append(expected_grad_vars) + config = tf.ConfigProto(allow_soft_placement=True) + with tf.Session(config=config) as sess: + sess.run(tf.initialize_all_variables()) + sess.run(variable_mgr._warmup_ops) + if deferred_grads: + # With deferred grads, the result of a session run is always the summed + # gradients from the previous session run. + sess.run(agg_device_grads) + feed_dict = {g: 0 for grad_vars in tower_grads for g, _ in grad_vars} + agg_device_grads_ = sess.run(agg_device_grads, feed_dict) + else: + agg_device_grads_ = sess.run(agg_device_grads) + self.assertEqual(agg_device_grads_, expected_device_grads) + + def _test_grad_aggregation(self, params, num_vars): + bench = benchmark_cnn.BenchmarkCNN(params) + deferred_grads = (params.variable_consistency == 'relaxed') + self._test_grad_aggregation_with_var_mgr(bench.variable_mgr, bench.num_gpus, + num_vars, deferred_grads) + + def test_grad_aggregation(self): + base_params = benchmark_cnn.make_params(num_gpus=10, + variable_update='replicated', + use_fp16=True) + params = base_params + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3) + self._test_grad_aggregation(params, 10) + params = base_params._replace(variable_consistency='relaxed') + self._test_grad_aggregation(params, 10) + params = base_params._replace(compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed') + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(variable_consistency='relaxed', + compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed', + compact_gradient_transfer=False) + self._test_grad_aggregation(params, 10) + params = base_params._replace(num_gpus=8, hierarchical_copy=True) + self._test_grad_aggregation(params, 10) + # TODO(b/80125832): Enable nccl in tests + # params = base_params._replace(all_reduce_spec='nccl', + # compact_gradient_transfer=False, + # # For some reason, this test freezes when + # # num_gpus=10 + # num_gpus=8) + # self._test_grad_aggregation(params, 10) + params = base_params._replace(all_reduce_spec='pscpu') + self._test_grad_aggregation(params, 10) + + params = base_params._replace(num_gpus=8, + gradient_repacking=3, + variable_consistency='relaxed', + hierarchical_copy=True) + self._test_grad_aggregation(params, 10) 
+ # TODO(b/80125832): Enable nccl in tests + # params = base_params._replace(num_gpus=8, + # gradient_repacking=3, + # variable_consistency='relaxed', + # all_reduce_spec='nccl', + # compact_gradient_transfer=False) + # self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed', + all_reduce_spec='pscpu') + self._test_grad_aggregation(params, 10) + params = base_params._replace(gradient_repacking=3, + variable_consistency='relaxed', + all_reduce_spec='xring') + self._test_grad_aggregation(params, 10) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/cnn_util.py b/cv/classification/resnet50/tensorflow/cnn_util.py new file mode 100644 index 0000000000000000000000000000000000000000..09e2fe3501e1c49ce30ea9d2131229bf39ed5707 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/cnn_util.py @@ -0,0 +1,253 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Utilities for CNN benchmarks.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import threading + +import numpy as np +import tensorflow.compat.v1 as tf + + +def tensorflow_version_tuple(): + v = tf.__version__ + major, minor, patch = v.split('.') + return (int(major), int(minor), patch) + + +def tensorflow_version(): + vt = tensorflow_version_tuple() + return vt[0] * 1000 + vt[1] + + +def log_fn(log): + print(log) + + +def roll_numpy_batches(array, batch_size, shift_ratio): + """Moves a proportion of batches from start to the end of the array. + + This function moves a proportion of batches, specified by `shift_ratio`, from + the starts of the array to the end. The number of batches moved is rounded + down to the nearest integer. For example, + + ``` + roll_numpy_batches([1, 2, 3, 4, 5, 6], 2, 0.34) == [3, 4, 5, 6, 1, 2] + ``` + + Args: + array: A Numpy array whose first dimension is the batch dimension. + batch_size: The batch size. + shift_ratio: Proportion of batches to move from the start of the array to + the end of the array. + Returns: + A new Numpy array, with a proportion of the batches at the start of `array` + moved to the end. + """ + num_items = array.shape[0] + assert num_items % batch_size == 0 + num_batches = num_items // batch_size + starting_batch = int(num_batches * shift_ratio) + starting_item = starting_batch * batch_size + return np.roll(array, -starting_item, axis=0) + + +# For Python 2.7 compatibility, we do not use threading.Barrier. +class Barrier(object): + """Implements a lightweight Barrier. + + Useful for synchronizing a fixed number of threads at known synchronization + points. Threads block on 'wait()' and simultaneously return once they have + all made that call. 
+ + # Implementation adopted from boost/thread/barrier.hpp + """ + + def __init__(self, parties): + """Create a barrier, initialised to 'parties' threads.""" + self.cond = threading.Condition(threading.Lock()) + self.parties = parties + # Indicates the number of waiting parties. + self.waiting = 0 + # generation is needed to deal with spurious wakeups. If self.cond.wait() + # wakes up for other reasons, generation will force it go back to wait(). + self.generation = 0 + self.broken = False + + def wait(self): + """Wait for the barrier.""" + with self.cond: + # Check if the barrier has been disabled or not. + if self.broken: + return + gen = self.generation + self.waiting += 1 + if self.waiting == self.parties: + self.waiting = 0 + self.generation += 1 + self.cond.notify_all() + # loop because of spurious wakeups + while gen == self.generation: + self.cond.wait() + + # TODO(huangyp): Remove this method once we find a way to know which step + # is the last barrier. + def abort(self): + """Clear existing barrier and disable this barrier.""" + with self.cond: + if self.waiting > 0: + self.generation += 1 + self.cond.notify_all() + self.broken = True + + +class ImageProducer(object): + """An image producer that puts images into a staging area periodically. + + This class is useful for periodically running a set of ops, `put_ops` on a + different thread every `batch_group_size` steps. + + The notify_image_consumption() method is used to increment an internal counter + so that every `batch_group_size` times it is called, `put_ops` is executed. A + barrier is placed so that notify_image_consumption() will block until + the previous call to `put_ops` has been executed. + + The start() method is used to start the thread that runs `put_ops`. + + The done() method waits until the last put_ops is executed and stops the + thread. + + The purpose of this class is to fill an image input pipeline every + `batch_group_size` steps. Suppose `put_ops` supplies `batch_group_size` images + to the input pipeline when run, and that every step, 1 batch of images is + consumed. Then, by calling notify_image_consumption() every step, images are + supplied to the input pipeline at the same amount they are consumed. + + Example usage: + ``` + put_ops = ... # Enqueues `batch_group_size` batches to a StagingArea + get_op = ... # Dequeues 1 batch, and does some operations on it + batch_group_size = 4 + with tf.Session() as sess: + image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size) + image_producer.start() + for _ in range(100): + sess.run(get_op) + image_producer.notify_image_consumption() + ``` + """ + + def __init__(self, sess, put_ops, batch_group_size, use_python32_barrier): + self.sess = sess + self.num_gets = 0 + self.put_ops = put_ops + self.batch_group_size = batch_group_size + self.done_event = threading.Event() + if (use_python32_barrier and + sys.version_info[0] == 3 and sys.version_info[1] >= 2): + self.put_barrier = threading.Barrier(2) + else: + self.put_barrier = Barrier(2) + + def _should_put(self): + return (self.num_gets + 1) % self.batch_group_size == 0 + + def done(self): + """Stop the image producer.""" + self.done_event.set() + self.put_barrier.abort() + self.thread.join() + + def start(self): + """Start the image producer.""" + self.sess.run([self.put_ops]) + self.thread = threading.Thread(target=self._loop_producer) + # Set daemon to true to allow Ctrl + C to terminate all threads. 
+ self.thread.daemon = True + self.thread.start() + + def notify_image_consumption(self): + """Increment the counter of image_producer by 1. + + This should only be called by the main thread that consumes images and runs + the model computation. One batch of images should be consumed between + calling start() and the first call to this method. Then, one batch of images + should be consumed between any two successive calls to this method. + """ + if self._should_put(): + self.put_barrier.wait() + self.num_gets += 1 + + def _loop_producer(self): + while not self.done_event.isSet(): + self.sess.run([self.put_ops]) + self.put_barrier.wait() + + +class BaseClusterManager(object): + """The manager for the cluster of servers running the benchmark.""" + + def __init__(self, params): + worker_hosts = params.worker_hosts.split(',') + ps_hosts = params.ps_hosts.split(',') if params.ps_hosts else [] + cluster = {'worker': worker_hosts} + if ps_hosts: + cluster['ps'] = ps_hosts + self._cluster_spec = tf.train.ClusterSpec(cluster) + + def get_target(self): + """Returns a target to be passed to tf.Session().""" + raise NotImplementedError('get_target must be implemented by subclass') + + def join_server(self): + raise NotImplementedError('join must be implemented by subclass') + + def get_cluster_spec(self): + return self._cluster_spec + + def num_workers(self): + return len(self._cluster_spec.job_tasks('worker')) + + def num_ps(self): + if 'ps' in self._cluster_spec.jobs: + return len(self._cluster_spec.job_tasks('ps')) + else: + return 0 + + +class GrpcClusterManager(BaseClusterManager): + """A cluster manager for a cluster networked with gRPC.""" + + def __init__(self, params, config_proto): + super(GrpcClusterManager, self).__init__(params) + if params.job_name == 'controller': + self._target = 'grpc://%s' % self._cluster_spec.job_tasks('worker')[0] + else: + self._server = tf.train.Server(self._cluster_spec, + job_name=params.job_name, + task_index=params.task_index, + config=config_proto, + protocol=params.server_protocol) + self._target = self._server.target + + def get_target(self): + return self._target + + def join_server(self): + return self._server.join() diff --git a/cv/classification/resnet50/tensorflow/cnn_util_test.py b/cv/classification/resnet50/tensorflow/cnn_util_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7c245afbf8de9d72f8b9287e5a104f1ffd42bde8 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/cnn_util_test.py @@ -0,0 +1,129 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for tf_cnn_benchmarks.cnn_util.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import threading +import time + +import tensorflow.compat.v1 as tf + +import cnn_util + + +class CnnUtilBarrierTest(tf.test.TestCase): + + def testBarrier(self): + num_tasks = 20 + num_waits = 4 + barrier = cnn_util.Barrier(num_tasks) + threads = [] + sync_matrix = [] + for i in range(num_tasks): + sync_times = [0] * num_waits + thread = threading.Thread( + target=self._run_task, args=(barrier, sync_times)) + thread.start() + threads.append(thread) + sync_matrix.append(sync_times) + for thread in threads: + thread.join() + for wait_index in range(num_waits - 1): + # Max of times at iteration i < min of times at iteration i + 1 + self.assertLessEqual( + max([sync_matrix[i][wait_index] for i in range(num_tasks)]), + min([sync_matrix[i][wait_index + 1] for i in range(num_tasks)])) + + def _run_task(self, barrier, sync_times): + for wait_index in range(len(sync_times)): + sync_times[wait_index] = time.time() + barrier.wait() + + def testBarrierAbort(self): + num_tasks = 2 + num_waits = 1 + sync_times = [0] * num_waits + barrier = cnn_util.Barrier(num_tasks) + thread = threading.Thread( + target=self._run_task, args=(barrier, sync_times)) + thread.start() + barrier.abort() + # thread won't be blocked by done barrier. + thread.join() + + +class ImageProducerTest(tf.test.TestCase): + + def _slow_tensorflow_op(self): + """Returns a TensorFlow op that takes approximately 0.1s to complete.""" + def slow_func(v): + time.sleep(0.1) + return v + return tf.py_func(slow_func, [tf.constant(0.)], tf.float32).op + + def _test_image_producer(self, batch_group_size, put_slower_than_get): + # We use the variable x to simulate a staging area of images. x represents + # the number of batches in the staging area. + x = tf.Variable(0, dtype=tf.int32) + if put_slower_than_get: + put_dep = self._slow_tensorflow_op() + get_dep = tf.no_op() + else: + put_dep = tf.no_op() + get_dep = self._slow_tensorflow_op() + with tf.control_dependencies([put_dep]): + put_op = x.assign_add(batch_group_size, use_locking=True) + with tf.control_dependencies([get_dep]): + get_op = x.assign_sub(1, use_locking=True) + with self.test_session() as sess: + sess.run(tf.variables_initializer([x])) + image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size, + use_python32_barrier=False) + image_producer.start() + for _ in range(5 * batch_group_size): + sess.run(get_op) + # We assert x is nonnegative, to ensure image_producer never causes + # an unstage op to block. We assert x is at most 2 * batch_group_size, + # to ensure it doesn't use too much memory by storing too many batches + # in the staging area. 
+ self.assertGreaterEqual(sess.run(x), 0) + self.assertLessEqual(sess.run(x), 2 * batch_group_size) + image_producer.notify_image_consumption() + self.assertGreaterEqual(sess.run(x), 0) + self.assertLessEqual(sess.run(x), 2 * batch_group_size) + + image_producer.done() + time.sleep(0.1) + self.assertGreaterEqual(sess.run(x), 0) + self.assertLessEqual(sess.run(x), 2 * batch_group_size) + + def test_image_producer(self): + self._test_image_producer(1, False) + self._test_image_producer(1, True) + self._test_image_producer(2, False) + self._test_image_producer(2, True) + self._test_image_producer(3, False) + self._test_image_producer(3, True) + self._test_image_producer(8, False) + self._test_image_producer(8, True) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/coco_metric.py b/cv/classification/resnet50/tensorflow/coco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..d8ba67da47c79da96ec3d96feae91169cac7509c --- /dev/null +++ b/cv/classification/resnet50/tensorflow/coco_metric.py @@ -0,0 +1,198 @@ +# Copyright 2018 Google. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""COCO-style evaluation metrics. + +Forked from reference model implementation. + +COCO API: github.com/cocodataset/cocoapi/ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import atexit +import tempfile + +from absl import flags + +import numpy as np +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +import six + +import tensorflow.compat.v1 as tf + +import mlperf +import ssd_constants + +FLAGS = flags.FLAGS + + +# https://github.com/cocodataset/cocoapi/issues/49 +if six.PY3: + import pycocotools.coco + pycocotools.coco.unicode = str + + +def async_eval_runner(queue_predictions, queue_results, val_json_file): + """Load intermediate eval results and get COCO metrics.""" + while True: + message = queue_predictions.get() + if message == 'STOP': # poison pill + break + step, predictions = message + results = compute_map(predictions, val_json_file) + queue_results.put((step, results)) + + +def compute_map(predictions, val_json_file): + """Use model predictions to compute mAP. 
+
+  Args:
+    predictions: a list of tuples returned by the decode_predictions function
+      below, each containing the following elements:
+      image source_id, box coordinates in XYWH order, probability score, label
+    val_json_file: path to COCO annotation file
+  Returns:
+    A dictionary that maps all COCO metrics (keys) to their values
+  """
+
+  if val_json_file.startswith("gs://"):
+    _, local_val_json = tempfile.mkstemp(suffix=".json")
+    tf.gfile.Remove(local_val_json)
+
+    tf.gfile.Copy(val_json_file, local_val_json)
+    atexit.register(tf.gfile.Remove, local_val_json)
+  else:
+    local_val_json = val_json_file
+
+  cocoGt = COCO(local_val_json)
+  cocoDt = cocoGt.loadRes(np.array(predictions))
+  E = COCOeval(cocoGt, cocoDt, iouType='bbox')
+  E.evaluate()
+  E.accumulate()
+  E.summarize()
+  print("Current AP: {:.5f}".format(E.stats[0]))
+  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+
+  # Prefix with "COCO" to group in TensorBoard.
+  return {"COCO/" + key: value for key, value in zip(metric_names, E.stats)}
+
+
+def calc_iou(target, candidates):
+  target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1))
+  # Left Top & Right Bottom
+  lt = np.maximum(target_tiled[:,:2], candidates[:,:2])
+
+  rb = np.minimum(target_tiled[:,2:], candidates[:,2:])
+
+  delta = np.maximum(rb - lt, 0)
+
+  intersect = delta[:,0] * delta[:,1]
+
+  # Areas of the target box and of each candidate box.
+  delta1 = target_tiled[:,2:] - target_tiled[:,:2]
+  area1 = delta1[:,0] * delta1[:,1]
+  delta2 = candidates[:,2:] - candidates[:,:2]
+  area2 = delta2[:,0] * delta2[:,1]
+
+  iou = intersect/(area1 + area2 - intersect)
+  return iou
+
+
+# TODO(haoyuzhang): Rewrite this NumPy based implementation to TensorFlow based
+# implementation under ssd_model.py accuracy_function.
+def decode_predictions(labels_and_predictions):
+  """Decode predictions and remove unused boxes and labels."""
+  predictions = []
+  for example in labels_and_predictions:
+    source_id = int(example[ssd_constants.SOURCE_ID])
+    pred_box = example[ssd_constants.PRED_BOXES]
+    pred_scores = example[ssd_constants.PRED_SCORES]
+
+    locs, labels, probs = decode_single(
+        pred_box, pred_scores, ssd_constants.OVERLAP_CRITERIA,
+        ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)
+
+    raw_height, raw_width, _ = example[ssd_constants.RAW_SHAPE]
+    for loc, label, prob in zip(locs, labels, probs):
+      # Ordering convention differs, hence [1], [0] rather than [0], [1]
+      x, y = loc[1] * raw_width, loc[0] * raw_height
+      w, h = (loc[3] - loc[1]) * raw_width, (loc[2] - loc[0]) * raw_height
+      predictions.append(
+          [source_id, x, y, w, h, prob, ssd_constants.CLASS_INV_MAP[label]])
+  mlperf.logger.log(key=mlperf.tags.NMS_THRESHOLD,
+                    value=ssd_constants.OVERLAP_CRITERIA)
+  mlperf.logger.log(key=mlperf.tags.NMS_MAX_DETECTIONS,
+                    value=ssd_constants.MAX_NUM_EVAL_BOXES)
+  return predictions
+
+
+def decode_single(bboxes_in, scores_in, criteria, max_output, max_num=200):
+  # Reference: https://github.com/amdegroot/ssd.pytorch
+
+  bboxes_out = []
+  scores_out = []
+  labels_out = []
+
+  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
+    score = np.squeeze(score, 1)
+
+    # skip background
+    if i == 0:
+      continue
+
+    mask = score > ssd_constants.MIN_SCORE
+    if not np.any(mask):
+      continue
+
+    bboxes, score = bboxes_in[mask, :], score[mask]
+
+    score_idx_sorted = np.argsort(score)
+    score_sorted = score[score_idx_sorted]
+
+    score_idx_sorted = score_idx_sorted[-max_num:]
+    candidates = []
+
+    # perform non-maximum suppression
+    while
len(score_idx_sorted): + idx = score_idx_sorted[-1] + bboxes_sorted = bboxes[score_idx_sorted, :] + bboxes_idx = bboxes[idx, :] + iou = calc_iou(bboxes_idx, bboxes_sorted) + + score_idx_sorted = score_idx_sorted[iou < criteria] + candidates.append(idx) + + bboxes_out.append(bboxes[candidates, :]) + scores_out.append(score[candidates]) + labels_out.extend([i]*len(candidates)) + + if len(scores_out) == 0: + tf.logging.info("No objects detected. Returning dummy values.") + return ( + np.zeros(shape=(1, 4), dtype=np.float32), + np.zeros(shape=(1,), dtype=np.int32), + np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE, + ) + + bboxes_out = np.concatenate(bboxes_out, axis=0) + scores_out = np.concatenate(scores_out, axis=0) + labels_out = np.array(labels_out) + + max_ids = np.argsort(scores_out)[-max_output:] + + return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] diff --git a/cv/classification/resnet50/tensorflow/constants.py b/cv/classification/resnet50/tensorflow/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..dbb32271bb2669e0ba12588d87d39f7c8924b161 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/constants.py @@ -0,0 +1,67 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Constants used in tf_cnn_benchmarks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from enum import Enum + +# Results fetched with this prefix will not be reduced. Instead, they will be +# passed as matrices to model's postprocess function. +UNREDUCED_ACCURACY_OP_PREFIX = "tensor:" + +# Eval result values with this name prefix will be included in summary. +SIMPLE_VALUE_RESULT_PREFIX = "simple_value:" + + +class BenchmarkMode(object): + """Benchmark running mode.""" + TRAIN = "training" + EVAL = "evaluation" + TRAIN_AND_EVAL = "training + evaluation" + FORWARD_ONLY = "forward only" + + +class NetworkTopology(str, Enum): + """Network topology describes how multiple GPUs are inter-connected. + """ + # DGX-1 uses hybrid cube mesh topology with the following device peer to peer + # matrix: + # DMA: 0 1 2 3 4 5 6 7 + # 0: Y Y Y Y Y N N N + # 1: Y Y Y Y N Y N N + # 2: Y Y Y Y N N Y N + # 3: Y Y Y Y N N N Y + # 4: Y N N N Y Y Y Y + # 5: N Y N N Y Y Y Y + # 6: N N Y N Y Y Y Y + # 7: N N N Y Y Y Y Y + DGX1 = "dgx1" + + # V100 in GCP are connected with the following device peer to peer matrix. + # In this topology, bandwidth of the connection depends on if it uses NVLink + # or PCIe link. 
+ # DMA: 0 1 2 3 4 5 6 7 + # 0: Y Y Y Y N Y N N + # 1: Y Y Y Y N N N N + # 2: Y Y Y Y N N N Y + # 3: Y Y Y Y N N N N + # 4: N N N N Y Y Y Y + # 5: Y N N N Y Y Y Y + # 6: N N N N Y Y Y Y + # 7: N N Y N Y Y Y Y + GCP_V100 = "gcp_v100" diff --git a/cv/classification/resnet50/tensorflow/convnet_builder.py b/cv/classification/resnet50/tensorflow/convnet_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..9903de9247e7401b2982bb061fb6f4bdce7be179 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/convnet_builder.py @@ -0,0 +1,498 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""CNN builder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict +import contextlib + +import numpy as np + +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import mlperf +from tensorflow.python.layers import convolutional as conv_layers +from tensorflow.python.layers import core as core_layers +from tensorflow.python.layers import normalization as normalization_layers +from tensorflow.python.layers import pooling as pooling_layers +from tensorflow.python.training import moving_averages + + +_data_format_to_channel_axis = {'NCHW': 1, 'NHWC': 3} + + +class ConvNetBuilder(object): + """Builder of cnn net.""" + + def __init__(self, + input_op, + input_nchan, + phase_train, + use_tf_layers, + data_format='NCHW', + dtype=tf.float32, + variable_dtype=tf.float32): + self.top_layer = input_op + self.top_size = input_nchan + self.phase_train = phase_train + self.use_tf_layers = use_tf_layers + self.data_format = data_format + self.dtype = dtype + self.variable_dtype = variable_dtype + self.counts = defaultdict(lambda: 0) + self.use_batch_norm = False + self.batch_norm_config = {} # 'decay': 0.997, 'scale': True} + self.channel_pos = ('channels_last' + if data_format == 'NHWC' else 'channels_first') + self.aux_top_layer = None + self.aux_top_size = 0 + + def get_custom_getter(self): + """Returns a custom getter that this class's methods must be called under. + + All methods of this class must be called under a variable scope that was + passed this custom getter. Example: + + ```python + network = ConvNetBuilder(...) + with tf.variable_scope('cg', custom_getter=network.get_custom_getter()): + network.conv(...) + # Call more methods of network here + ``` + + Currently, this custom getter only does anything if self.use_tf_layers is + True. In that case, it causes variables to be stored as dtype + self.variable_type, then casted to the requested dtype, instead of directly + storing the variable as the requested dtype. 
+ """ + def inner_custom_getter(getter, *args, **kwargs): + """Custom getter that forces variables to have type self.variable_type.""" + if not self.use_tf_layers: + return getter(*args, **kwargs) + requested_dtype = kwargs['dtype'] + if not (requested_dtype == tf.float32 and + self.variable_dtype == tf.float16): + # Only change the variable dtype if doing so does not decrease variable + # precision. + kwargs['dtype'] = self.variable_dtype + var = getter(*args, **kwargs) + # This if statement is needed to guard the cast, because batch norm + # assigns directly to the return value of this custom getter. The cast + # makes the return value not a variable so it cannot be assigned. Batch + # norm variables are always in fp32 so this if statement is never + # triggered for them. + if var.dtype.base_dtype != requested_dtype: + var = tf.cast(var, requested_dtype) + return var + return inner_custom_getter + + @contextlib.contextmanager + def switch_to_aux_top_layer(self): + """Context that construct cnn in the auxiliary arm.""" + if self.aux_top_layer is None: + raise RuntimeError('Empty auxiliary top layer in the network.') + saved_top_layer = self.top_layer + saved_top_size = self.top_size + self.top_layer = self.aux_top_layer + self.top_size = self.aux_top_size + yield + self.aux_top_layer = self.top_layer + self.aux_top_size = self.top_size + self.top_layer = saved_top_layer + self.top_size = saved_top_size + + def get_variable(self, name, shape, dtype, cast_dtype, *args, **kwargs): + # TODO(reedwm): Currently variables and gradients are transferred to other + # devices and machines as type `dtype`, not `cast_dtype`. In particular, + # this means in fp16 mode, variables are transferred as fp32 values, not + # fp16 values, which uses extra bandwidth. + var = tf.get_variable(name, shape, dtype, *args, **kwargs) + return tf.cast(var, cast_dtype) + + def _conv2d_impl(self, input_layer, num_channels_in, filters, kernel_size, + strides, padding, kernel_initializer): + if self.use_tf_layers: + return conv_layers.conv2d(input_layer, filters, kernel_size, strides, + padding, self.channel_pos, + kernel_initializer=kernel_initializer, + use_bias=False) + else: + weights_shape = [kernel_size[0], kernel_size[1], num_channels_in, filters] + # We use the name 'conv2d/kernel' so the variable has the same name as its + # tf.layers equivalent. This way, if a checkpoint is written when + # self.use_tf_layers == True, it can be loaded when + # self.use_tf_layers == False, and vice versa. 
+ weights = self.get_variable('conv2d/kernel', weights_shape, + self.variable_dtype, self.dtype, + initializer=kernel_initializer) + if self.data_format == 'NHWC': + strides = [1] + strides + [1] + else: + strides = [1, 1] + strides + return tf.nn.conv2d(input_layer, weights, strides, padding, + data_format=self.data_format) + + def conv(self, + num_out_channels, + k_height, + k_width, + d_height=1, + d_width=1, + mode='SAME', + input_layer=None, + num_channels_in=None, + use_batch_norm=None, + stddev=None, + activation='relu', + bias=0.0, + kernel_initializer=None): + """Construct a conv2d layer on top of cnn.""" + if input_layer is None: + input_layer = self.top_layer + if num_channels_in is None: + num_channels_in = self.top_size + if stddev is not None and kernel_initializer is None: + kernel_initializer = tf.truncated_normal_initializer(stddev=stddev) + if kernel_initializer is None: + kernel_initializer = tf.variance_scaling_initializer() + name = 'conv' + str(self.counts['conv']) + self.counts['conv'] += 1 + with tf.variable_scope(name): + strides = [1, d_height, d_width, 1] + if self.data_format == 'NCHW': + strides = [strides[0], strides[3], strides[1], strides[2]] + if mode != 'SAME_RESNET': + conv = self._conv2d_impl(input_layer, num_channels_in, num_out_channels, + kernel_size=[k_height, k_width], + strides=[d_height, d_width], padding=mode, + kernel_initializer=kernel_initializer) + else: # Special padding mode for ResNet models + if d_height == 1 and d_width == 1: + conv = self._conv2d_impl(input_layer, num_channels_in, + num_out_channels, + kernel_size=[k_height, k_width], + strides=[d_height, d_width], padding='SAME', + kernel_initializer=kernel_initializer) + else: + rate = 1 # Unused (for 'a trous' convolutions) + kernel_height_effective = k_height + (k_height - 1) * (rate - 1) + pad_h_beg = (kernel_height_effective - 1) // 2 + pad_h_end = kernel_height_effective - 1 - pad_h_beg + kernel_width_effective = k_width + (k_width - 1) * (rate - 1) + pad_w_beg = (kernel_width_effective - 1) // 2 + pad_w_end = kernel_width_effective - 1 - pad_w_beg + padding = [[0, 0], [pad_h_beg, pad_h_end], + [pad_w_beg, pad_w_end], [0, 0]] + if self.data_format == 'NCHW': + padding = [padding[0], padding[3], padding[1], padding[2]] + padded_input_layer = tf.pad(input_layer, padding) + conv = self._conv2d_impl(padded_input_layer, num_channels_in, + num_out_channels, + kernel_size=[k_height, k_width], + strides=[d_height, d_width], padding='VALID', + kernel_initializer=kernel_initializer) + if use_batch_norm is None: + use_batch_norm = self.use_batch_norm + mlperf.logger.log_conv2d(input_tensor=input_layer, output_tensor=conv, + stride_height=d_height, stride_width=d_width, + filters=num_out_channels, + initializer=kernel_initializer, + use_bias=not use_batch_norm and bias is not None) + if not use_batch_norm: + if bias is not None: + biases = self.get_variable('biases', [num_out_channels], + self.variable_dtype, self.dtype, + initializer=tf.constant_initializer(bias)) + biased = tf.reshape( + tf.nn.bias_add(conv, biases, data_format=self.data_format), + conv.get_shape()) + else: + biased = conv + else: + self.top_layer = conv + self.top_size = num_out_channels + biased = self.batch_norm(**self.batch_norm_config) + if activation == 'relu': + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + conv1 = tf.nn.relu(biased) + elif activation == 'linear' or activation is None: + conv1 = biased + elif activation == 'tanh': + conv1 = tf.nn.tanh(biased) + else: + raise KeyError('Invalid activation type 
\'%s\'' % activation) + self.top_layer = conv1 + self.top_size = num_out_channels + return conv1 + + def _pool(self, + pool_name, + pool_function, + k_height, + k_width, + d_height, + d_width, + mode, + input_layer, + num_channels_in): + """Construct a pooling layer.""" + if input_layer is None: + input_layer = self.top_layer + else: + self.top_size = num_channels_in + name = pool_name + str(self.counts[pool_name]) + self.counts[pool_name] += 1 + if self.use_tf_layers: + pool = pool_function( + input_layer, [k_height, k_width], [d_height, d_width], + padding=mode, + data_format=self.channel_pos, + name=name) + else: + if self.data_format == 'NHWC': + ksize = [1, k_height, k_width, 1] + strides = [1, d_height, d_width, 1] + else: + ksize = [1, 1, k_height, k_width] + strides = [1, 1, d_height, d_width] + pool = tf.nn.max_pool(input_layer, ksize, strides, padding=mode, + data_format=self.data_format, name=name) + if pool_name == 'mpool': + mlperf.logger.log_max_pool(input_tensor=input_layer, + output_tensor=pool) + self.top_layer = pool + return pool + + def mpool(self, + k_height, + k_width, + d_height=2, + d_width=2, + mode='VALID', + input_layer=None, + num_channels_in=None): + """Construct a max pooling layer.""" + return self._pool('mpool', pooling_layers.max_pooling2d, k_height, k_width, + d_height, d_width, mode, input_layer, num_channels_in) + + def apool(self, + k_height, + k_width, + d_height=2, + d_width=2, + mode='VALID', + input_layer=None, + num_channels_in=None): + """Construct an average pooling layer.""" + return self._pool('apool', pooling_layers.average_pooling2d, k_height, + k_width, d_height, d_width, mode, input_layer, + num_channels_in) + + def reshape(self, shape, input_layer=None): + if input_layer is None: + input_layer = self.top_layer + self.top_layer = tf.reshape(input_layer, shape) + self.top_size = shape[-1] # HACK This may not always work + return self.top_layer + + def affine(self, + num_out_channels, + input_layer=None, + num_channels_in=None, + bias=0.0, + stddev=None, + activation='relu'): + if input_layer is None: + input_layer = self.top_layer + if num_channels_in is None: + num_channels_in = self.top_size + name = 'affine' + str(self.counts['affine']) + self.counts['affine'] += 1 + with tf.variable_scope(name): + init_factor = 2. if activation == 'relu' else 1. 
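+      # A factor of 2 mirrors He initialization for ReLU layers
+      # (stddev = sqrt(2 / fan_in)); a factor of 1 gives the plain
+      # sqrt(1 / fan_in) scaling used for linear outputs, so the default
+      # stddev below is sqrt(init_factor / num_channels_in).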
+ stddev = stddev or np.sqrt(init_factor / num_channels_in) + kernel = self.get_variable( + 'weights', [num_channels_in, num_out_channels], + self.variable_dtype, self.dtype, + initializer=tf.truncated_normal_initializer(stddev=stddev)) + biases = self.get_variable('biases', [num_out_channels], + self.variable_dtype, self.dtype, + initializer=tf.constant_initializer(bias)) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_DENSE, + value=num_out_channels) + logits = tf.nn.xw_plus_b(input_layer, kernel, biases) + if activation == 'relu': + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + affine1 = tf.nn.relu(logits, name=name) + elif activation == 'linear' or activation is None: + affine1 = logits + else: + raise KeyError('Invalid activation type \'%s\'' % activation) + self.top_layer = affine1 + self.top_size = num_out_channels + return affine1 + + def inception_module(self, name, cols, input_layer=None, in_size=None): + if input_layer is None: + input_layer = self.top_layer + if in_size is None: + in_size = self.top_size + name += str(self.counts[name]) + self.counts[name] += 1 + with tf.variable_scope(name): + col_layers = [] + col_layer_sizes = [] + for c, col in enumerate(cols): + col_layers.append([]) + col_layer_sizes.append([]) + for l, layer in enumerate(col): + ltype, args = layer[0], layer[1:] + kwargs = { + 'input_layer': input_layer, + 'num_channels_in': in_size + } if l == 0 else {} + if ltype == 'conv': + self.conv(*args, **kwargs) + elif ltype == 'mpool': + self.mpool(*args, **kwargs) + elif ltype == 'apool': + self.apool(*args, **kwargs) + elif ltype == 'share': # Share matching layer from previous column + self.top_layer = col_layers[c - 1][l] + self.top_size = col_layer_sizes[c - 1][l] + else: + raise KeyError( + 'Invalid layer type for inception module: \'%s\'' % ltype) + col_layers[c].append(self.top_layer) + col_layer_sizes[c].append(self.top_size) + catdim = 3 if self.data_format == 'NHWC' else 1 + self.top_layer = tf.concat([layers[-1] for layers in col_layers], catdim) + self.top_size = sum([sizes[-1] for sizes in col_layer_sizes]) + return self.top_layer + + def spatial_mean(self, keep_dims=False): + name = 'spatial_mean' + str(self.counts['spatial_mean']) + self.counts['spatial_mean'] += 1 + axes = [1, 2] if self.data_format == 'NHWC' else [2, 3] + self.top_layer = tf.reduce_mean( + self.top_layer, axes, keepdims=keep_dims, name=name) + return self.top_layer + + def dropout(self, keep_prob=0.5, input_layer=None): + if input_layer is None: + input_layer = self.top_layer + else: + self.top_size = None + name = 'dropout' + str(self.counts['dropout']) + with tf.variable_scope(name): + if not self.phase_train: + keep_prob = 1.0 + if self.use_tf_layers: + dropout = core_layers.dropout(input_layer, 1. - keep_prob, + training=self.phase_train) + else: + dropout = tf.nn.dropout(input_layer, keep_prob) + self.top_layer = dropout + return dropout + + def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon): + """Batch normalization on `input_layer` without tf.layers.""" + # We make this function as similar as possible to the + # tf.contrib.layers.batch_norm, to minimize the differences between using + # layers and not using layers. 
+ shape = input_layer.shape + num_channels = shape[3] if self.data_format == 'NHWC' else shape[1] + beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32, + initializer=tf.zeros_initializer()) + if use_scale: + gamma = self.get_variable('gamma', [num_channels], tf.float32, + tf.float32, initializer=tf.ones_initializer()) + else: + gamma = tf.constant(1.0, tf.float32, [num_channels]) + # For moving variables, we use tf.get_variable instead of self.get_variable, + # since self.get_variable returns the result of tf.cast which we cannot + # assign to. + moving_mean = tf.get_variable('moving_mean', [num_channels], + tf.float32, + initializer=tf.zeros_initializer(), + trainable=False) + moving_variance = tf.get_variable('moving_variance', [num_channels], + tf.float32, + initializer=tf.ones_initializer(), + trainable=False) + if self.phase_train: + bn, batch_mean, batch_variance = tf.nn.fused_batch_norm( + input_layer, gamma, beta, epsilon=epsilon, + data_format=self.data_format, is_training=True) + mean_update = moving_averages.assign_moving_average( + moving_mean, batch_mean, decay=decay, zero_debias=False) + variance_update = moving_averages.assign_moving_average( + moving_variance, batch_variance, decay=decay, zero_debias=False) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update) + tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update) + else: + bn, _, _ = tf.nn.fused_batch_norm( + input_layer, gamma, beta, mean=moving_mean, + variance=moving_variance, epsilon=epsilon, + data_format=self.data_format, is_training=False) + return bn + + def batch_norm(self, input_layer=None, decay=0.999, scale=False, + epsilon=0.001): + """Adds a Batch Normalization layer.""" + if input_layer is None: + input_layer = self.top_layer + else: + self.top_size = None + name = 'batchnorm' + str(self.counts['batchnorm']) + self.counts['batchnorm'] += 1 + + center = True + with tf.variable_scope(name) as scope: + if self.use_tf_layers: + layer_obj = normalization_layers.BatchNormalization( + momentum=decay, + scale=scale, + epsilon=epsilon, + fused=True, + axis=_data_format_to_channel_axis[self.data_format], + # We pass this 'scope' argument for compatibility with checkpoints + # created with the contrib version of batch norm. tf_cnn_benchmarks + # used to use the contrib version. + _scope=scope, + center=center, + name=scope.name) + bn = layer_obj.apply(input_layer, training=self.phase_train) + else: + bn = self._batch_norm_without_layers(input_layer, decay, scale, epsilon) + self.top_layer = bn + self.top_size = bn.shape[3] if self.data_format == 'NHWC' else bn.shape[1] + self.top_size = int(self.top_size) + mlperf.logger.log_batch_norm( + input_tensor=input_layer, output_tensor=bn, momentum=decay, + epsilon=epsilon, center=center, scale=scale, training=self.phase_train) + return bn + + def lrn(self, depth_radius, bias, alpha, beta): + """Adds a local response normalization layer.""" + name = 'lrn' + str(self.counts['lrn']) + self.counts['lrn'] += 1 + self.top_layer = tf.nn.lrn( + self.top_layer, depth_radius, bias, alpha, beta, name=name) + return self.top_layer diff --git a/cv/classification/resnet50/tensorflow/datasets.py b/cv/classification/resnet50/tensorflow/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..e51743e3d37231256288636ead999b8d23eb3dfe --- /dev/null +++ b/cv/classification/resnet50/tensorflow/datasets.py @@ -0,0 +1,272 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmark dataset utilities. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from abc import abstractmethod +import os + +import numpy as np +import six +from six.moves import cPickle +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +from tensorflow.python.platform import gfile +import preprocessing + +IMAGENET_NUM_TRAIN_IMAGES = 1281167 +IMAGENET_NUM_VAL_IMAGES = 50000 + +IMAGENETTE_NUM_TRAIN_IMAGES = 9469 +IMAGENETTE_NUM_VAL_IMAGES = 3925 + +COCO_NUM_TRAIN_IMAGES = 118287 +COCO_NUM_VAL_IMAGES = 4952 + + +class Dataset(object): + """Abstract class for cnn benchmarks dataset.""" + + def __init__(self, + name, + data_dir=None, + queue_runner_required=False, + num_classes=None): + self.name = name + self.data_dir = data_dir + self._queue_runner_required = queue_runner_required + self._num_classes = num_classes + + def tf_record_pattern(self, subset): + return os.path.join(self.data_dir, '%s-*-of-*' % subset) + + def reader(self): + return tf.TFRecordReader() + + @property + def num_classes(self): + return self._num_classes + + @num_classes.setter + def num_classes(self, val): + self._num_classes = val + + @abstractmethod + def num_examples_per_epoch(self, subset): + pass + + def __str__(self): + return self.name + + def get_input_preprocessor(self, input_preprocessor='default'): + assert not self.use_synthetic_gpu_inputs() + return _SUPPORTED_INPUT_PREPROCESSORS[self.name][input_preprocessor] + + def queue_runner_required(self): + return self._queue_runner_required + + def use_synthetic_gpu_inputs(self): + return not self.data_dir + + +class LibrispeechDataset(Dataset): + """Configuration for LibriSpeech dataset.""" + + def __init__(self, data_dir=None): + super(LibrispeechDataset, self).__init__( + 'librispeech', data_dir, num_classes=29) + + def tf_record_pattern(self, subset): + if subset == 'train': + return os.path.join(self.data_dir, 'train-clean-*.tfrecords') + elif subset == 'validation': + return os.path.join(self.data_dir, 'test-clean.tfrecords') + else: + return '' + + def num_examples_per_epoch(self, subset='train'): + del subset + return 2 # TODO(laigd): currently this is an arbitrary number. 
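+
+
+# Illustrative usage (not executed here; the data path is a placeholder).
+# A Dataset is usually built via create_dataset() at the bottom of this file,
+# or by instantiating a concrete subclass directly, and is then queried for
+# epoch sizes and preprocessors, e.g.:
+#
+#   dataset = create_dataset('/path/to/imagenette_tfrecord', 'imagenette')
+#   dataset.num_examples_per_epoch('train')      # 9469 for Imagenette
+#   dataset.tf_record_pattern('train')           # '<data_dir>/train-*-of-*'
+#   preprocessor_cls = dataset.get_input_preprocessor('default')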
+ + +class ImageDataset(Dataset): + """Abstract class for image datasets.""" + + def __init__(self, + name, + height, + width, + depth=None, + data_dir=None, + queue_runner_required=False, + num_classes=1001): + super(ImageDataset, self).__init__(name, data_dir, queue_runner_required, + num_classes) + self.height = height + self.width = width + self.depth = depth or 3 + + +class ImagenetDataset(ImageDataset): + """Configuration for Imagenet dataset.""" + + def __init__(self, data_dir=None): + super(ImagenetDataset, self).__init__( + 'imagenet', 300, 300, data_dir=data_dir) + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENET_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENET_NUM_VAL_IMAGES + else: + raise ValueError('Invalid data subset "%s"' % subset) + +class ImagenetteDataset(ImageDataset): + """Configuration for Imagenette dataset.""" + def __init__(self, data_dir=None): + super(ImagenetteDataset, self).__init__( + 'imagenette', 300, 300, data_dir=data_dir, num_classes=10) + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENETTE_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENETTE_NUM_VAL_IMAGES + else: + raise ValueError('Invalid data subset "%s"' % subset) + +class Cifar10Dataset(ImageDataset): + """Configuration for cifar 10 dataset. + + It will mount all the input images to memory. + """ + + def __init__(self, data_dir=None): + super(Cifar10Dataset, self).__init__( + 'cifar10', + 32, + 32, + data_dir=data_dir, + queue_runner_required=True, + num_classes=11) + + def read_data_files(self, subset='train'): + """Reads from data file and returns images and labels in a numpy array.""" + assert self.data_dir, ('Cannot call `read_data_files` when using synthetic ' + 'data') + if subset == 'train': + filenames = [ + os.path.join(self.data_dir, 'data_batch_%d' % i) + for i in xrange(1, 6) + ] + elif subset == 'validation': + filenames = [os.path.join(self.data_dir, 'test_batch')] + else: + raise ValueError('Invalid data subset "%s"' % subset) + + inputs = [] + for filename in filenames: + with gfile.Open(filename, 'rb') as f: + # python2 does not have the encoding parameter + encoding = {} if six.PY2 else {'encoding': 'bytes'} + inputs.append(cPickle.load(f, **encoding)) + # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the + # input format. 
+ all_images = np.concatenate( + [each_input[b'data'] for each_input in inputs]).astype(np.float32) + all_labels = np.concatenate( + [each_input[b'labels'] for each_input in inputs]) + return all_images, all_labels + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return 50000 + elif subset == 'validation': + return 10000 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + +class COCODataset(ImageDataset): + """COnfiguration for COCO dataset.""" + + def __init__(self, data_dir=None, image_size=300): + super(COCODataset, self).__init__( + 'coco', image_size, image_size, data_dir=data_dir, num_classes=81) + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return COCO_NUM_TRAIN_IMAGES + elif subset == 'validation': + return COCO_NUM_VAL_IMAGES + else: + raise ValueError('Invalid data subset "%s"' % subset) + + +_SUPPORTED_DATASETS = { + 'imagenet': ImagenetDataset, + 'imagenette' : ImagenetteDataset, + 'cifar10': Cifar10Dataset, + 'librispeech': LibrispeechDataset, + 'coco': COCODataset, +} + +_SUPPORTED_INPUT_PREPROCESSORS = { + 'imagenet': { + 'default': preprocessing.RecordInputImagePreprocessor, + 'official_models_imagenet': preprocessing.ImagenetPreprocessor, + }, + 'imagenette': { + 'default': preprocessing.RecordInputImagePreprocessor, + 'official_models_imagenet': preprocessing.ImagenetPreprocessor, + }, + 'cifar10': { + 'default': preprocessing.Cifar10ImagePreprocessor + }, + 'librispeech': { + 'default': preprocessing.LibrispeechPreprocessor + }, + 'coco': { + 'default': preprocessing.COCOPreprocessor + }, +} + + +def create_dataset(data_dir, data_name): + """Create a Dataset instance based on data_dir and data_name.""" + if not data_dir and not data_name: + # When using synthetic data, use synthetic imagenet images by default. + data_name = 'imagenet' + + # Infere dataset name from data_dir if data_name is not provided. + if data_name is None: + for supported_name in _SUPPORTED_DATASETS: + if supported_name in data_dir: + data_name = supported_name + break + else: # Failed to identify dataset name from data dir. + raise ValueError('Could not identify name of dataset. ' + 'Please specify with --data_name option.') + if data_name not in _SUPPORTED_DATASETS: + raise ValueError('Unknown dataset. Must be one of %s' % ', '.join( + [key for key in sorted(_SUPPORTED_DATASETS.keys())])) + + return _SUPPORTED_DATASETS[data_name](data_dir) diff --git a/cv/classification/resnet50/tensorflow/download_script.sh b/cv/classification/resnet50/tensorflow/download_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..b51a687020423920f9a7d0cc1b8fc1ddf964b4bd --- /dev/null +++ b/cv/classification/resnet50/tensorflow/download_script.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
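+
+# NOTE: this script is currently a placeholder that exits immediately; see
+# get_imagenette.sh and the README in this directory for how the dataset is
+# actually prepared.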
+ +set -e +exit 0 \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/flags.py b/cv/classification/resnet50/tensorflow/flags.py new file mode 100644 index 0000000000000000000000000000000000000000..f65898ae2e68c3d0891dd605b877b78cf108e6c0 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/flags.py @@ -0,0 +1,93 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains functions to define flags and params. + +Calling a DEFINE_* function will add a ParamSpec namedtuple to the param_spec +dict. The DEFINE_* arguments match those in absl. Calling define_flags() creates +a command-line flag for every ParamSpec defined by a DEFINE_* functions. + +The reason we don't use absl flags directly is that we want to be able to use +tf_cnn_benchmarks as a library. When using it as a library, we don't want to +define any flags, but instead pass parameters to the BenchmarkCNN constructor. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple + +from absl import flags as absl_flags +import six + + +FLAGS = absl_flags.FLAGS + + +# ParamSpec describes one of benchmark_cnn.BenchmarkCNN's parameters. +ParamSpec = namedtuple('_ParamSpec', + ['flag_type', 'default_value', 'description', + 'kwargs']) + + +# Maps from parameter name to its ParamSpec. 
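+# A DEFINE_* call below only records an entry in this dict. For example
+# (hypothetical values), flags.DEFINE_integer('batch_size', 32,
+# 'Batch size per device.') adds a ParamSpec named 'batch_size', and the
+# corresponding --batch_size command-line flag is only created once
+# define_flags() is called.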
+param_specs = {} + + +def DEFINE_string(name, default, help): # pylint: disable=invalid-name,redefined-builtin + param_specs[name] = ParamSpec('string', default, help, {}) + + +def DEFINE_boolean(name, default, help): # pylint: disable=invalid-name,redefined-builtin + param_specs[name] = ParamSpec('boolean', default, help, {}) + + +def DEFINE_integer(name, default, help, lower_bound=None, upper_bound=None): # pylint: disable=invalid-name,redefined-builtin + kwargs = {'lower_bound': lower_bound, 'upper_bound': upper_bound} + param_specs[name] = ParamSpec('integer', default, help, kwargs) + + +def DEFINE_float(name, default, help, lower_bound=None, upper_bound=None): # pylint: disable=invalid-name,redefined-builtin + kwargs = {'lower_bound': lower_bound, 'upper_bound': upper_bound} + param_specs[name] = ParamSpec('float', default, help, kwargs) + + +def DEFINE_enum(name, default, enum_values, help): # pylint: disable=invalid-name,redefined-builtin + kwargs = {'enum_values': enum_values} + param_specs[name] = ParamSpec('enum', default, help, kwargs) + + +def DEFINE_list(name, default, help): # pylint: disable=invalid-name,redefined-builtin + param_specs[name] = ParamSpec('list', default, help, {}) + + +def define_flags(specs=None): + """Define a command line flag for each ParamSpec in flags.param_specs.""" + specs = specs or param_specs + define_flag = { + 'boolean': absl_flags.DEFINE_boolean, + 'float': absl_flags.DEFINE_float, + 'integer': absl_flags.DEFINE_integer, + 'string': absl_flags.DEFINE_string, + 'enum': absl_flags.DEFINE_enum, + 'list': absl_flags.DEFINE_list + } + for name, param_spec in six.iteritems(specs): + if param_spec.flag_type not in define_flag: + raise ValueError('Unknown flag_type %s' % param_spec.flag_type) + else: + define_flag[param_spec.flag_type](name, param_spec.default_value, + help=param_spec.description, + **param_spec.kwargs) diff --git a/cv/classification/resnet50/tensorflow/get_imagenette.sh b/cv/classification/resnet50/tensorflow/get_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..460e635f9ffdcc3a58c5980c7f299440691fd941 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/get_imagenette.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +: ${DATA_DIR:="./"} + + +if [ ! -d "./imagenette" ]; then + echo "Make soft link form ${DATA_DIR} to tf_cnn_benckmarks" + ln -s "${DATA_DIR}/imagenette_tfrecord" imagenette +fi + diff --git a/cv/classification/resnet50/tensorflow/get_num_devices.sh b/cv/classification/resnet50/tensorflow/get_num_devices.sh new file mode 100644 index 0000000000000000000000000000000000000000..1637c5a1f95e53979cc9e76a0f2ec7eab9fea564 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/get_num_devices.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +devices=$CUDA_VISIBLE_DEVICES +if [ -n "$devices" ]; then + _devices=(${devices//,/ }) + num_devices=${#_devices[@]} +else + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 + echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" +fi +export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/leading_indicators_test.py b/cv/classification/resnet50/tensorflow/leading_indicators_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1bd8715261afc5e19ca4484fe95c81f6c2330d26 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/leading_indicators_test.py @@ -0,0 +1,1003 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Benchmark various leading indicators CNNs. + +The purpose of these tests is to test each model as a high level baseline and +to ensure the various variable_update options have not regressing. Not all +options are tested. The tests focus on the most viable options. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ctypes +import logging +import os +import sys + +from absl import flags +from absl.testing import absltest # pylint: disable=unused-import +import tensorflow.compat.v1 as tf # pylint: disable=g-bad-import-order +import benchmark_cnn +from platforms import util as platforms_util + +flags.DEFINE_integer('num_batches', None, + 'number of batches to run, excluding warmup') + + +class BenchmarkBase(tf.test.Benchmark): + """Base class for all benchmarks in this file.""" + + def __init__(self, output_dir=None, root_data_dir=None, **kwargs): + """Base class for all benchmarks in this file. + + Args: + output_dir: directory where to output e.g. log files + root_data_dir: directory under which to look for dataset + **kwargs: arbitrary named arguments. This is needed to make the + constructor forward compatible in case PerfZero provides more + named arguments before updating the constructor. 
+ """ + + # Load default values if the benchmark is not run with absl.app.run() + if not flags.FLAGS.is_parsed(): + flags.FLAGS.mark_as_parsed() + + self.fake_data_dir = os.path.join(platforms_util.get_test_data_dir(), + 'fake_tf_record_data') + self.output_dir = output_dir + if root_data_dir is None: + self.data_dir = ('/readahead/200M/placer/prod/home/distbelief/' + 'imagenet-tensorflow/imagenet-2012-tfrecord') + else: + self.data_dir = os.path.join(root_data_dir, 'imagenet') + + def _run_benchmark(self, params): + """Run a CNN benchmark and report its results. + + Args: + params: Params tuple, typically created by benchmark_cnn.make_params or + benchmark_cnn.make_params_from_flags. + """ + logging.info('Running benchmark [%s]', self._get_name()) + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + stats = bench.run() + extras = {} + extras['examples_per_sec'] = stats.get('images_per_sec') + if 'last_average_loss' in stats: + extras['last_average_loss'] = stats['last_average_loss'] + if 'top_1_accuracy' in stats: + extras['top_1_accuracy'] = stats['top_1_accuracy'] + if 'top_5_accuracy' in stats: + extras['top_5_accuracy'] = stats['top_5_accuracy'] + self.report_benchmark( + iters=stats.get('num_steps'), + wall_time=stats.get('average_wall_time'), + extras=extras) + + def _shared_params(self): + """Returns shared parameters for all benchmarks in this file.""" + params = {} + if flags.FLAGS.num_batches is not None: + params['num_batches'] = flags.FLAGS.num_batches + if self.output_dir is not None: + params['benchmark_log_dir'] = self.output_dir + return benchmark_cnn.make_params(**params) + + def _binary_search_batch_size(self, params, init_batch_size): + """Find the max batch_size using binary search.""" + assert init_batch_size > 0 + low_batch_size = 0 + high_batch_size = None + batch_size = init_batch_size + + # No need to run a warmup or many batches; if it doesn't OOM after 10 + # batches, it should work in general. + params = params._replace(num_batches=10, num_warmup_batches=0) + + # Find high_batch_size first. + tf.logging.info( + 'Looking for upper bound to batch size, starting with %d' % batch_size) + while high_batch_size is None: + tf.logging.info('Trying batch_size %d' % batch_size) + params = params._replace(batch_size=batch_size) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + try: + bench.run() + low_batch_size = batch_size + batch_size *= 2 + except tf.errors.ResourceExhaustedError: + high_batch_size = batch_size - 1 + + # Binary Search + tf.logging.info( + 'Max batch size is in range (%d, %d]. Starting binary search to find ' + 'exact max batch size.' 
% (low_batch_size, batch_size)) + while low_batch_size < high_batch_size: + batch_size = (low_batch_size + high_batch_size + 1) // 2 + tf.logging.info('Trying batch_size %d' % batch_size) + params = params._replace(batch_size=batch_size) + bench = benchmark_cnn.BenchmarkCNN(params) + bench.print_info() + try: + bench.run() + low_batch_size = batch_size + except tf.errors.ResourceExhaustedError: + high_batch_size = batch_size - 1 + self.report_benchmark(extras={'max_batch_size': low_batch_size}) + + +class Resnet50BenchmarksInferenceCpu(BenchmarkBase): + """"Benchmarks for ResNet50 inference on CPU.""" + + def _shared_params(self): + """Returns shared parameters for all ResNet50 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + num_gpus=1, + model='resnet50', + num_warmup_batches=5, + num_batches=50, + distortions=False, + forward_only=True, + device='cpu', + data_format='NHWC', + num_intra_threads=0) + + def benchmark_synth_forward_batch1(self): + """Tests 1 CPU batch size 1.""" + params = self._shared_params()._replace(batch_size=1) + self._run_benchmark(params) + + def benchmark_synth_forward_batch16(self): + """Tests 1 CPU batch size 16.""" + params = self._shared_params()._replace(batch_size=16) + self._run_benchmark(params) + + +class FrozenResnet50BenchmarksInferenceCpu(Resnet50BenchmarksInferenceCpu): + """"Benchmarks for ResNet50 frozen graph inference on CPU.""" + + def _shared_params(self): + return super(FrozenResnet50BenchmarksInferenceCpu, + self)._shared_params()._replace(freeze_when_forward_only=True) + + +class Resnet50BenchmarksInference(BenchmarkBase): + """"Benchmarks for ResNet50 inference.""" + + def _shared_params(self): + """Returns shared parameters for all ResNet50 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + num_gpus=1, model='resnet50', distortions=False, forward_only=True) + + def benchmark_synth_forward_batch128(self): + """Tests 1 GPU batch size 128.""" + params = self._shared_params()._replace(batch_size=128) + self._run_benchmark(params) + + def benchmark_fp16_synth_forward_batch128(self): + """Tests 1 GPU batch size 128 FP16.""" + params = self._shared_params()._replace(batch_size=128, use_fp16=True) + self._run_benchmark(params) + + def benchmark_fp16_synth_forward_batch16(self): + """Tests 1 GPU batch size 16 FP16.""" + params = self._shared_params()._replace(batch_size=16, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_synth_forward_batch128(self): + """Tests 1 GPU batch size 128 with XLA.""" + params = self._shared_params()._replace(batch_size=128, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_forward_batch128(self): + """Tests 1 GPU batch size 128 FP16 with XLA.""" + params = self._shared_params()._replace( + batch_size=128, use_fp16=True, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_forward_batch16(self): + """Tests 1 GPU batch size 16 FP16 with XLA.""" + params = self._shared_params()._replace( + batch_size=16, use_fp16=True, xla=True) + self._run_benchmark(params) + + +class FrozenResnet50BenchmarksInference(Resnet50BenchmarksInference): + """"Benchmarks for ResNet50 frozen graph inference.""" + + def _shared_params(self): + return super(FrozenResnet50BenchmarksInference, + self)._shared_params()._replace(freeze_when_forward_only=True) + + def benchmark_trt_synth_forward_batch128(self): + """Tests 1 GPU batch size 128.""" + params = self._shared_params()._replace(batch_size=128, trt_mode='FP32') + self._run_benchmark(params) + + 
# TODO(laigd): enable fp16 tests for TF-TRT, it's currently not supported yet. + # def benchmark_fp16_trt_synth_forward_batch128(self): + # """Tests 1 GPU batch size 128 FP16.""" + # params = self._shared_params()._replace( + # batch_size=128, use_fp16=True, trt_mode='FP16') + # self._run_benchmark(params) + + # Test with batch size 16 to compare with native TF GPU implementation and + # XLA. + # def benchmark_fp16_trt_synth_forward_batch16(self): + # """Tests 1 GPU batch size 16 FP16.""" + # params = self._shared_params()._replace( + # batch_size=16, use_fp16=True, trt_mode='FP16') + # self._run_benchmark(params) + + +class Resnet50Benchmarks(BenchmarkBase): + """"Benchmark resnet50 configurations.""" + + def _shared_params(self): + """Returns shared parameters for all ResNet50 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='resnet50', batch_size=128, distortions=False, + optimizer='momentum') + + def _shared_params_fp16(self): + """Returns shared parameters for all ResNet50 FP16 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='resnet50', + batch_size=256, + distortions=False, + use_fp16=True, + optimizer='momentum', + loss_type_to_report='base_loss', + compute_lr_on_cpu=True, + single_l2_loss_op=True + ) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params()._replace(num_gpus=1) + self._run_benchmark(params) + + def benchmark_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data.""" + params = self._shared_params()._replace( + num_gpus=1, data_dir=self.fake_data_dir, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_synth_1gpu_max_batch_size(self): + """Finds largest batch size that can be run with 1 gpu using synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._binary_search_batch_size(params, init_batch_size=128) + + def benchmark_synth_4gpu_gpureplicated(self): + """Tests 4 gpu with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=4, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fake_8gpu_gpureplicated(self): + """Tests 8 gpu with fake data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + # FP16 mixed-precision tests. 
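+  # These correspond roughly to a command line such as (illustrative only):
+  #   python tf_cnn_benchmarks.py --model=resnet50 --batch_size=256 --use_fp16 \
+  #     --optimizer=momentum --variable_update=replicated \
+  #     --all_reduce_spec=nccl --gradient_repacking=2 --num_gpus=8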
+ + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on the gpu.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams_batch128(self): + """Tests 1 gpu with synthetic data with parameters on the gpu.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, batch_size=128, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_4gpu_gpureplicated(self): + """Tests 4 gpu with synthetic data with nccl and all_reduce.""" + params = self._shared_params_fp16()._replace( + num_gpus=4, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic with nccl and all_reduce.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_fake_1gpu_gpuparams(self): + """Tests 1 gpus with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_fake_8gpu_gpureplicated(self): + """Tests 8 gpus with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_fakedistort_8gpu_gpureplicated(self): + """Tests 8 gpus with fake distorted data.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + distortions=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + # XLA versions of Resnet50 tests only for single GPU. + def benchmark_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data with XLA.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing on guitar. 
+ def benchmark_ng_xla_batch64_synth_1gpu_gpuparams(self): + """Tests 1 gpu with XLA, synth data, and batch 64.""" + params = self._shared_params()._replace( + num_gpus=1, batch_size=64, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch64_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, XLA, synth data, and batch 64.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + batch_size=64, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, XLA, and synth data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + batch_size=128, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_max_batch_size(self): + """Finds largest batch that can be run with XLA, 1 gpu, and synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._binary_search_batch_size(params, init_batch_size=128) + + def benchmark_xla_real_1gpu_gpuparams(self): + """Tests 1 gpu with real data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fakedistort_1gpu_gpuparams(self): + """Tests 1 gpu with fake distorted data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + distortions=True, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class Resnet50v15Benchmarks(BenchmarkBase): + """"Benchmark various ResNet50V1.5 configurations. + + ResNetV1.5 differs from V1 in stride 2 is used in the first 3x3 convolution of + each block instead of the first 1x1 convolution. 
+ """ + + def _shared_params_fp16(self): + """Returns shared parameters for all ResNet50v1.5 FP16 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='resnet50_v1.5', + batch_size=256, + distortions=False, + use_fp16=True, + optimizer='momentum', + loss_type_to_report='base_loss', + compute_lr_on_cpu=True, + single_l2_loss_op=True + ) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params_fp16()._replace(num_gpus=1) + self._run_benchmark(params) + + def benchmark_fp16_batch256_synth_8gpu_gpuparams(self): + """Tests 8 gpus with synthetic data at batch 256.""" + params = self._shared_params_fp16()._replace(num_gpus=8) + self._run_benchmark(params) + + def benchmark_fp16_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data at batch 128 (useful for small GPUs).""" + params = self._shared_params_fp16()._replace(num_gpus=1, batch_size=128) + self._run_benchmark(params) + + def benchmark_fp16_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, data_dir=self.fake_data_dir, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fp16_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fp16_fake_8gpu_gpureplicated(self): + """Tests 8 gpu with fake data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2) + self._run_benchmark(params) + + # XLA versions of Resnet50v1.5 tests. 
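+  # Note: in these params xla=True requests XLA auto-jit for the session,
+  # while xla_compile=True (used in several benchmarks below) takes a
+  # separate, explicit-compilation path; the two are benchmarked separately
+  # (see benchmark_cnn.py for how each flag is handled).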
+ def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data with XLA.""" + params = self._shared_params_fp16()._replace(num_gpus=1, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, batch128, synthetic data with XLA.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, batch_size=128, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params_fp16()._replace(num_gpus=1, xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_batch128_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data at batch 128 (useful for small GPUs).""" + params = self._shared_params_fp16()._replace( + num_gpus=1, num_batches=200, batch_size=128, xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_batch256_synth_8gpu_gpuparams(self): + """Tests 8 gpu with synthetic data and xla autojit.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, num_batches=200, batch_size=256, xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data.""" + params = self._shared_params_fp16()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_fake_8gpu_gpureplicated(self): + """Tests 8 gpu with fake data with parameters replicated.""" + params = self._shared_params_fp16()._replace( + num_gpus=8, + num_batches=200, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + xla_compile=True) + self._run_benchmark(params) + + +class Vgg16Benchmarks(BenchmarkBase): + """"Benchmark various vgg16 configurations.""" + + def _shared_params(self): + """Returns shared parameters for all vgg16 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='vgg16', batch_size=128, distortions=False) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, use_fp16=True, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_synth_8gpu_gpureplicated(self): + """Tests 8 gpu with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + all_reduce_spec='nccl', + 
variable_update='replicated', + compact_gradient_transfer=False, + gradient_repacking=2) + self._run_benchmark(params) + + # XLA versions of VGG16 tests only for single GPU. + def benchmark_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data, and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_real_1gpu_gpuparams(self): + """Tests 1 gpu with real data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class TrivialBenchmarks(BenchmarkBase): + """"Benchmarks for trivial model. + + The purpose of these tests is to verify the upper bound for the input + pipeline. Fake data creates an upperbound on the input pipeline throughput. + """ + + def _shared_params(self): + """Returns shared parameters for all trivial benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='trivial', + num_gpus=8, + distortions=False, + variable_update='independent', + data_dir=self.fake_data_dir) + + def benchmark_fake_64batch(self): + params = self._shared_params()._replace(batch_size=64, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fake_128batch(self): + params = self._shared_params()._replace( + batch_size=128, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fake_256batch(self): + params = self._shared_params()._replace( + batch_size=256, data_name='imagenet') + self._run_benchmark(params) + + def benchmark_fakedistort_128batch(self): + params = self._shared_params()._replace( + batch_size=128, data_name='imagenet', distortions=True) + self._run_benchmark(params) + + +class AlexnetBenchmarks(BenchmarkBase): + """"Benchmarks for alexnet.""" + + def _shared_params(self): + """Returns shared parameters for all alexnet benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='alexnet', batch_size=512, distortions=False) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data with parameters on gpu.""" + params = self._shared_params()._replace( + num_gpus=1, use_fp16=True, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_synth_8gpu_gpureplicated(self): + """Tests 8 gpus with synthetic data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + compact_gradient_transfer=False, + gradient_repacking=2) + self._run_benchmark(params) + + def benchmark_fake_8gpu_gpureplicated(self): + """Tests 8 gpus with fake 
data with parameters replicated.""" + params = self._shared_params()._replace( + num_gpus=8, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='replicated', + all_reduce_spec='nccl', + compact_gradient_transfer=False, + gradient_repacking=2) + self._run_benchmark(params) + + # XLA Benchmark tests for AlexNet. + def benchmark_xla_synth_1gpuparams(self): + """Tests 1 gpu with synthetic data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, synthetic data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + # Test does not run as part of continuous testing. + def benchmark_xla_fake_1gpuparams(self): + """Tests 1 gpu with fake data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_real_1gpuparams(self): + """Tests 1 gpu with real data and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class InceptionV3Benchmarks(BenchmarkBase): + """"Benchmark for InceptionV3.""" + + def _shared_params(self): + """Returns shared parameters for all InceptionV3 benchmarks.""" + return BenchmarkBase._shared_params(self)._replace( + model='inception3', batch_size=64, distortions=False) + + def benchmark_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic data.""" + params = self._shared_params()._replace( + num_gpus=1, use_fp16=True, variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_synth_1gpu_max_batch_size(self): + """Finds largest batch size that can be run with 1 gpu using synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server') + self._binary_search_batch_size(params, init_batch_size=128) + + def benchmark_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with synthetic and XLA.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + """Tests 1 gpu with fp16, XLA and synthetic data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_max_batch_size(self): + """Finds largest batch that can be run with XLA, 1 gpu, and synth data.""" + params = self._shared_params()._replace( + num_gpus=1, variable_update='parameter_server', xla=True) + self._binary_search_batch_size(params, init_batch_size=128) + + # Test does not run as part of continuous testing. 
+ def benchmark_xla_fake_1gpu_gpuparams(self): + """Tests 1 gpu with fake data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.fake_data_dir, + data_name='imagenet', + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + def benchmark_xla_real_1gpu_gpuparams(self): + """Tests 1 gpu with real data with XLA.""" + params = self._shared_params()._replace( + num_gpus=1, + data_dir=self.data_dir, + variable_update='parameter_server', + xla=True) + self._run_benchmark(params) + + +class NcfBenchmarks(BenchmarkBase): + """Benchmarks for neural collaborative filtering.""" + + def _shared_params(self): + return BenchmarkBase._shared_params(self)._replace( + model='ncf', batch_size=64*1024, num_gpus=1, num_warmup_batches=1) + + def benchmark_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace(variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_fp16_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla=True, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_compile_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla_compile=True) + self._run_benchmark(params) + + def benchmark_fp16_xla_compile_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla_compile=True, use_fp16=True) + self._run_benchmark(params) + + +class DeepSpeech2Benchmarks(BenchmarkBase): + """Benchmarks for DeepSpeech2 model.""" + + def _shared_params(self): + return BenchmarkBase._shared_params(self)._replace( + model='deepspeech2', batch_size=32, num_gpus=1, data_name='librispeech') + + def benchmark_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace(variable_update='parameter_server') + self._run_benchmark(params) + + def benchmark_xla_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla=True) + self._run_benchmark(params) + + def benchmark_xla_compile_synth_1gpu_gpuparams(self): + params = self._shared_params()._replace( + variable_update='parameter_server', xla_compile=True) + self._run_benchmark(params) + + +class SsdBenchmarks(BenchmarkBase): + """Benchmarks for SSD model.""" + + def _cudnn_version(self): + if sys.platform == 'win32': + return None + + lib = ctypes.cdll.LoadLibrary(None) + if hasattr(lib, 'cudnnGetErrorString'): + version = lib.cudnnGetVersion() + return version + + return None + + def _shared_params(self): + cudnn_version = self._cudnn_version() + if cudnn_version is None or cudnn_version < 7300: + raise RuntimeError( + 'Needs at least cuDNN 7.3 to work with fp16 (b/112048183). ' + 'Build with --define=use_experimental_cudnn=1') + + return BenchmarkBase._shared_params(self)._replace( + # TODO(b/115672206): Replace backbone model and data dir with replicated + # placer location for better performance. 
+ backbone_model_path=platforms_util.get_ssd_backborn_model_file(), # pylint: disable=line-too-long + data_dir=platforms_util.get_ssd_backboard_data_dir(), + batch_size=128, + data_name='coco', + model='ssd300', + num_batches=10, + num_warmup_batches=1, + num_gpus=1, + optimizer='momentum', + momentum=0.9, + weight_decay=5e-4, + loss_type_to_report='base_loss', + single_l2_loss_op=True, + compute_lr_on_cpu=True, + ) + + def benchmark_xla_compile_real_1gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=1, + xla_compile=True, + ) + self._run_benchmark(params) + + def benchmark_real_1gpu_gpuparams(self): + params = self._shared_params()._replace(num_gpus=1,) + self._run_benchmark(params) + + def benchmark_xla_compile_fp16_real_1gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=1, xla_compile=True, use_fp16=True) + self._run_benchmark(params) + + def benchmark_fp16_real_1gpu_gpuparams(self): + params = self._shared_params()._replace(num_gpus=1, use_fp16=True) + self._run_benchmark(params) + + def benchmark_xla_compile_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + xla_compile=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + def benchmark_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + def benchmark_xla_compile_fp16_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + xla_compile=True, + use_fp16=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + def benchmark_fp16_real_8gpu_gpuparams(self): + params = self._shared_params()._replace( + num_gpus=8, + use_fp16=True, + variable_update='replicated', + all_reduce_spec='nccl', + gradient_repacking=2, + num_batches=50, + ) + self._run_benchmark(params) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/mlperf.py b/cv/classification/resnet50/tensorflow/mlperf.py new file mode 100644 index 0000000000000000000000000000000000000000..932f3136e1b5d4abb5afefebaf3c9512a7b0ca15 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/mlperf.py @@ -0,0 +1,260 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains functions related to MLPerf compliance. + +MLPerf requires submissions to log what the benchmark does, in order to verify +that the benchmark meets the MLPerf requirements. This module contains a global +object `logger` that is used by other files to log what tf_cnn_benchmarks does +for compliance. 
+ +By default, `logger` does nothing, as the MLPerf compliance logs are verbose and +unnecessary if one is not concerned about MLPerf compliance. The logger can be +enabled by using the `mlperf_logger` context manager. + +To enable the logger with `mlperf_logger`, the MLPerf compliance library at +https://github.com/mlperf/training/tree/master/compliance is required. If +the logger is not enabled, the library is not needed. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from collections import namedtuple +import contextlib +import os +import sys + +import tensorflow.compat.v1 as tf + +# pylint: disable=g-import-not-at-top +try: + # Not all users have the MLPerf compliance library, so we don't want to + # unconditionally crash if these imports fail. + from mlperf_compliance import mlperf_log + from mlperf_compliance import resnet_log_helper + from mlperf_compliance import tags + import_successful = True +except ImportError: + # The logger cannot be enabled in this case since the MLPerf library isn't + # found. We return empty strings from the `tags` attribute so that + # the benchmark can still run without crashing. This empty tags are passed + # to an instance of `NullMlPerfLogger`, which does not log anything and + # ignores the tag values. + + class _Tags(object): + + def __getattr__(self, item): + return '' + tags = _Tags() + import_successful = False +# pylint: enable=g-import-not-at-top + + +_ModelInfo = namedtuple('_ModelInfo', ['print_fn', 'tag_set', + 'mlperf_model_name']) + + +_MLPERF_LOG_PREFIX = ':::MLPv0.5.0' + + +class MlPerfLogger(object): + """Logs various aspects about a benchmark run for MLPerf compliance.""" + + def __init__(self, model): + self._root_dir = os.path.split(os.path.abspath(__file__))[0] + mlperf_log.ROOT_DIR_RESNET = self._root_dir + mlperf_log.ROOT_DIR_SSD = self._root_dir + self.model = model + model_to_info = { + 'resnet50_v1.5': _ModelInfo(mlperf_log.resnet_print, + mlperf_log.RESNET_TAG_SET, tags.RESNET), + 'ssd300': _ModelInfo(mlperf_log.ssd_print, mlperf_log.SSD_TAG_SET, + tags.SSD) + } + + try: + self._log_fn, self.tag_set, self.mlperf_model_name = model_to_info[model] + except KeyError: + raise ValueError('--ml_perf_compliance_logging is only compatible when ' + '--model is one of the following: ' + + ', '.join(model_to_info.keys())) + + def log(self, key, value=None, stack_offset=2): + if key in self.tag_set: + self._log_fn(key, value, stack_offset) + else: + print('Ignoring MLPerf logging item key=%s, value=%s for model %s' % + (key, value, self.model)) + + def log_deferred_tensor_value(self, key, tensor_value, global_step, + stack_offset=2, every_n=1): + """Logs the value of a tensor when the graph is run.""" + caller = '(%s)' % mlperf_log.get_caller(stack_offset, self._root_dir) + def create_print_op(): + return tf.print(_MLPERF_LOG_PREFIX, self.mlperf_model_name, + tf.timestamp(), caller, key, + ': { "deferred": true, "value":', tensor_value, '}', + output_stream=sys.stdout) + maybe_print = tf.cond(tf.equal(global_step % every_n, 0), create_print_op, + tf.no_op) + with tf.control_dependencies([maybe_print]): + return tf.identity(tensor_value) + + def log_max_pool(self, input_tensor, output_tensor): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_max_pool(input_tensor, output_tensor) + + def log_begin_block(self, input_tensor, block_type): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_begin_block(input_tensor, block_type) + + def 
log_end_block(self, output_tensor): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_end_block(output_tensor) + + def log_projection(self, input_tensor, output_tensor): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_projection(input_tensor, output_tensor) + + def log_conv2d(self, input_tensor, output_tensor, stride_height, stride_width, + filters, initializer, use_bias): + """Log a conv2d call.""" + if self.model == 'resnet50_v1.5': + assert stride_height == stride_width, ( + '--ml_perf_compliance_logging does not support convolutions where ' + 'the stride height is not equal to the stride width. ' + 'stride_height=%d, stride_width=%d' % (stride_height, stride_width)) + if isinstance(initializer, tf.truncated_normal_initializer) or ( + isinstance(initializer, tf.variance_scaling_initializer) and + initializer.distribution == 'truncated_normal'): + initializer = tags.TRUNCATED_NORMAL + elif (isinstance(initializer, tf.glorot_uniform_initializer) or + initializer is None): + initializer = 'glorot_uniform' + resnet_log_helper.log_conv2d(input_tensor, output_tensor, stride_width, + filters, initializer, use_bias) + + def log_batch_norm(self, input_tensor, output_tensor, momentum, epsilon, + center, scale, training): + if self.model == 'resnet50_v1.5': + resnet_log_helper.log_batch_norm(input_tensor, output_tensor, momentum, + epsilon, center, scale, training) + + def log_train_epochs(self, num_epochs): + """Logs all the TRAIN_EPOCHs log lines.""" + num_epochs_int = int(num_epochs) + for i in range(num_epochs_int): + # MLPerf allows us to print all the train epochs at once instead of + # printing them as we do them. + self.log(key=mlperf_log.TRAIN_EPOCH, value=i, stack_offset=3) + if num_epochs_int != num_epochs: + value = (str(num_epochs_int) + + ', but this epoch only has {}% of the examples of a normal epoch' + .format(100 * (num_epochs - num_epochs_int))) + self.log(key=mlperf_log.TRAIN_EPOCH, value=value, stack_offset=3) + + def log_input_resize_aspect_preserving(self, height, width, scale_factor): + assert height == width, ( + '--ml_perf_compliance_logging does not support models with nonsquare ' + 'images. Cannot process image with height=%d and width=%d' % + (height, width)) + self.log(key=tags.INPUT_RESIZE_ASPECT_PRESERVING, + value={'min': int(height * scale_factor)}) + + def log_eval_epoch(self, tag, global_step, batch_size, stack_offset=2): + if self.model == 'resnet50_v1.5': + self.log(key=tag, stack_offset=stack_offset+1) + elif self.model == 'ssd300': + epoch = int(global_step * batch_size / 118287) + self.log(key=tag, value=epoch, stack_offset=stack_offset+1) + + def log_eval_accuracy(self, accuracy, global_step, batch_size, + examples_per_epoch, stack_offset=2): + """Logs eval accuracy.""" + epoch = int(global_step * batch_size / examples_per_epoch) + eval_accuracy = {'epoch': epoch, 'value': accuracy} + eval_iteration_accuracy = {'iteration': global_step, 'value': accuracy} + self.log(key=tags.EVAL_ACCURACY, value=eval_accuracy, + stack_offset=stack_offset+1) + self.log(key=tags.EVAL_ITERATION_ACCURACY, + value=eval_iteration_accuracy, + stack_offset=stack_offset+1) + + +def _empty_fn(*args, **kwargs): + del args, kwargs + + +class NullMlPerfLogger(object): + """A version of `MlPerfLogger` that does not log anything. + + This class has the same interface as `MlPerfLogger`, but does not actually do + anything. This is used when logging is disabled, which is the default + behavior. 
+ """ + + def __getattr__(self, item): + return _empty_fn + + def log_deferred_tensor_value(self, key, tensor_value, *args, **kwargs): + del key, args, kwargs + return tensor_value + + +# A global singleton logger. By default, it's the null logger but can be +# switched to an MlPerfLogger with `mlperf_logger()`. +logger = NullMlPerfLogger() + + +@contextlib.contextmanager +def mlperf_logger(use_mlperf_logger, model): + """Optionally enable the mlperf logger. + + If `use_mlperf_logger` is True, sets the `logger` global variable to an + instance of MlPerfLogger that will print logs for MLPerf compliance. If + `use_mlperf_logger` is False, does nothing. + + Args: + use_mlperf_logger: If True, enables the mlperf logger. If False, this + function does nothing. + model: The model that will be logged. Required, because different models + must log different things for MLPerf compliance. + + Yields: + Nothing. + + Raises: + ImportError: If `use_mlperf_logger` is True but the MLPerf compliance + library cannot be imported + """ + global logger + if use_mlperf_logger: + if not import_successful: + raise ImportError('Failed to import MLPerf compliance library, which is ' + 'required when --ml_perf_compliance_logging is ' + 'specified. Clone this repo and add this directory ' + 'https://github.com/mlperf/training/tree/master/' + 'compliance to the PYTHONPATH environmental variable.') + logger_ = MlPerfLogger(model) + old_logger = logger + try: + logger = logger_ + yield + finally: + logger = old_logger + else: + yield diff --git a/cv/classification/resnet50/tensorflow/mlperf_test.py b/cv/classification/resnet50/tensorflow/mlperf_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7e83fc29603580b24466c22db2de3732f3d6c13e --- /dev/null +++ b/cv/classification/resnet50/tensorflow/mlperf_test.py @@ -0,0 +1,189 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains tests related to MLPerf. + +Note this test only passes if the MLPerf compliance library is installed. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import Counter +import logging +import re + +import six +import tensorflow.compat.v1 as tf +import benchmark_cnn +import datasets +import mlperf +import test_util +from models import model +from mlperf_compliance import mlperf_log + + +class _MlPerfTestModel(model.CNNModel): + """A model to test the MLPerf compliance logging on.""" + + def __init__(self): + super(_MlPerfTestModel, self).__init__( + 'mlperf_test_model', image_size=224, batch_size=2, learning_rate=1) + + def add_inference(self, cnn): + assert cnn.top_layer.shape[1:] == (3, 224, 224) + cnn.conv(1, 1, 1, 1, 1, use_batch_norm=True) + cnn.mpool(1, 1, 1, 1, num_channels_in=1) + cnn.reshape([-1, 224 * 224]) + cnn.affine(1, activation=None) + + # Assert that the batch norm variables are filtered out for L2 loss. + variables = tf.global_variables() + tf.local_variables() + assert len(variables) > len(self.filter_l2_loss_vars(variables)) + + +class MlPerfComplianceTest(tf.test.TestCase): + """Tests the MLPerf compliance logs. + + This serves as a quick check that we probably didn't break the compliance + logging. It is not mean to be as comprehensive as the official MLPerf + compliance checker will be. + """ + + def setUp(self): + super(MlPerfComplianceTest, self).setUp() + benchmark_cnn.setup(benchmark_cnn.make_params()) + + # Map between regex and the number of times we expect to see that regex in the + # logs. Entry commented out with the comment FIXME indicate that + # tf_cnn_benchmarks currently fails compliance in that regard, and needs to be + # fixed to be MLPerf compliant. + EXPECTED_LOG_REGEXES = { + # Preprocessing tags + mlperf.tags.INPUT_ORDER: 2, # 1 for training, 1 for eval + # We pass --tf_random_seed=9876 in the test. + r'%s: 9876' % mlperf.tags.RUN_SET_RANDOM_SEED: 2, + # The Numpy random seed is hardcoded to 4321. + r'%s: 4321' % mlperf.tags.RUN_SET_RANDOM_SEED: 2, + r'%s: %d' % (mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES, + datasets.IMAGENET_NUM_TRAIN_IMAGES): 1, + r'%s: %d' % (mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES, + datasets.IMAGENET_NUM_VAL_IMAGES): 1, + mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE + '.*': 1, + mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS + '.*': 1, + mlperf.tags.INPUT_RANDOM_FLIP + '.*': 1, + r'%s: \[224, 224\].*' % mlperf.tags.INPUT_CENTRAL_CROP: 1, + + r'%s: \[123.68, 116.78, 103.94\].*' % mlperf.tags.INPUT_MEAN_SUBTRACTION: + 2, + + r'%s: {"min": 256}.*' % mlperf.tags.INPUT_RESIZE_ASPECT_PRESERVING: 1, + + # 1 for training, 1 for eval + r'%s: \[224, 224\].*' % mlperf.tags.INPUT_RESIZE: 2, + + # Resnet model tags + mlperf.tags.MODEL_HP_BATCH_NORM + '.*': 2, + # 2 for training, 2 for eval. Although there's only 1 conv2d, each conv2d + # produces 2 logs. + mlperf.tags.MODEL_HP_CONV2D_FIXED_PADDING + '.*': 4, + mlperf.tags.MODEL_HP_RELU + '.*': 2, + mlperf.tags.MODEL_HP_INITIAL_MAX_POOL + '.*': 2, + mlperf.tags.MODEL_HP_DENSE + '.*': 4, + mlperf.tags.MODEL_HP_DENSE + '.*': 4, + + # Note that tags our test model does not emit, like MODEL_HP_SHORTCUT_ADD, + # are omitted here. 
+ + r'%s: "categorical_cross_entropy".*' % mlperf.tags.MODEL_HP_LOSS_FN: 1, + + # 1 for training, 2 because the _MlPerfTestModel calls this when building + # the model for both training and eval + r'%s: true' % mlperf.tags.MODEL_EXCLUDE_BN_FROM_L2: 3, + + r'%s: 0.5.*' % mlperf.tags.MODEL_L2_REGULARIZATION: 1, + + # Note we do not handle OPT_LR, since that is printed to stderr using + # tf.Print, which we cannot easily intercept. + + # Other tags + '%s: "%s"' % (mlperf.tags.OPT_NAME, mlperf.tags.SGD_WITH_MOMENTUM): 1, + '%s: 0.5' % mlperf.tags.OPT_MOMENTUM: 1, + mlperf.tags.RUN_START: 1, + '%s: 2' % mlperf.tags.INPUT_BATCH_SIZE: 1, + mlperf.tags.TRAIN_LOOP: 1, + mlperf.tags.TRAIN_EPOCH + '.*': 1, + '%s: 2' % mlperf.tags.INPUT_SIZE: 2, + mlperf.tags.EVAL_START: 2, + mlperf.tags.EVAL_STOP: 2, + '%s: 6' % mlperf.tags.EVAL_SIZE: 2, + mlperf.tags.EVAL_ACCURACY + '.*': 2, + '%s: 2.0' % mlperf.tags.EVAL_TARGET: 2, + mlperf.tags.RUN_STOP + '.*': 1, + mlperf.tags.RUN_FINAL: 1 + } + EXPECTED_LOG_REGEXES = Counter({re.compile(k): v for + k, v in EXPECTED_LOG_REGEXES.items()}) + + def testMlPerfCompliance(self): + string_io = six.StringIO() + handler = logging.StreamHandler(string_io) + data_dir = test_util.create_black_and_white_images() + try: + mlperf_log.LOGGER.addHandler(handler) + params = benchmark_cnn.make_params(data_dir=data_dir, + data_name='imagenet', + batch_size=2, + num_warmup_batches=0, + num_batches=2, + num_eval_batches=3, + eval_during_training_every_n_steps=1, + distortions=False, + weight_decay=0.5, + optimizer='momentum', + momentum=0.5, + stop_at_top_1_accuracy=2.0, + tf_random_seed=9876, + ml_perf=True) + with mlperf.mlperf_logger(use_mlperf_logger=True, model='resnet50_v1.5'): + bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=_MlPerfTestModel()) + bench_cnn.run() + logs = string_io.getvalue().splitlines() + log_regexes = Counter() + for log in logs: + for regex in self.EXPECTED_LOG_REGEXES: + if regex.search(log): + log_regexes[regex] += 1 + if log_regexes != self.EXPECTED_LOG_REGEXES: + diff_counter = Counter(log_regexes) + diff_counter.subtract(self.EXPECTED_LOG_REGEXES) + differences = [] + for regex in (k for k in diff_counter.keys() if diff_counter[k]): + found_count = log_regexes[regex] + expected_count = self.EXPECTED_LOG_REGEXES[regex] + differences.append(' For regex %s: Found %d lines matching but ' + 'expected to find %d' % + (regex.pattern, found_count, expected_count)) + raise AssertionError('Logs did not match expected logs. Differences:\n' + '%s' % '\n'.join(differences)) + finally: + mlperf_log.LOGGER.removeHandler(handler) + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/__init__.py b/cv/classification/resnet50/tensorflow/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/models/alexnet_model.py b/cv/classification/resnet50/tensorflow/models/alexnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4611fd60d19a3dd704e47323e7fa9a5320f596 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/alexnet_model.py @@ -0,0 +1,93 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Alexnet model configuration. + +References: + Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton + ImageNet Classification with Deep Convolutional Neural Networks + Advances in Neural Information Processing Systems. 2012 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from models import model + + +class AlexnetModel(model.CNNModel): + """Alexnet cnn model.""" + + def __init__(self, params=None): + super(AlexnetModel, self).__init__( + 'alexnet', 224 + 3, 512, 0.005, params=params) + + def add_inference(self, cnn): + # Note: VALID requires padding the images by 3 in width and height + cnn.conv(64, 11, 11, 4, 4, 'VALID') + cnn.mpool(3, 3, 2, 2) + cnn.conv(192, 5, 5) + cnn.mpool(3, 3, 2, 2) + cnn.conv(384, 3, 3) + cnn.conv(384, 3, 3) + cnn.conv(256, 3, 3) + cnn.mpool(3, 3, 2, 2) + cnn.reshape([-1, 256 * 6 * 6]) + cnn.affine(4096) + cnn.dropout() + cnn.affine(4096) + cnn.dropout() + + +class AlexnetCifar10Model(model.CNNModel): + """Alexnet cnn model for cifar datasets. + + The model architecture follows the one defined in the tensorflow tutorial + model. + + Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py + Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf + """ + + def __init__(self, params=None): + super(AlexnetCifar10Model, self).__init__( + 'alexnet', 32, 128, 0.1, params=params) + + def add_inference(self, cnn): + cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2) + cnn.mpool(3, 3, 2, 2, mode='SAME') + cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) + cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2) + cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) + cnn.mpool(3, 3, 2, 2, mode='SAME') + shape = cnn.top_layer.get_shape().as_list() + flat_dim = shape[1] * shape[2] * shape[3] + cnn.reshape([-1, flat_dim]) + cnn.affine(384, stddev=0.04, bias=0.1) + cnn.affine(192, stddev=0.04, bias=0.1) + + def get_learning_rate(self, global_step, batch_size): + num_examples_per_epoch = 50000 + num_epochs_per_decay = 100 + decay_steps = ( + num_epochs_per_decay * num_examples_per_epoch // batch_size) + decay_factor = 0.1 + return tf.train.exponential_decay( + self.learning_rate, + global_step, + decay_steps, + decay_factor, + staircase=True) diff --git a/cv/classification/resnet50/tensorflow/models/densenet_model.py b/cv/classification/resnet50/tensorflow/models/densenet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cb61b9b3f3332587daa2e308ba6d722cba408e1b --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/densenet_model.py @@ -0,0 +1,100 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
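The CIFAR-10 AlexNet variant above uses a staircase exponential decay: the rate drops by 10x every `num_epochs_per_decay * num_examples_per_epoch // batch_size` steps. A quick back-of-the-envelope sketch of that schedule in plain Python, rather than `tf.train.exponential_decay`:

```python
# Staircase exponential decay, mirroring the schedule used for AlexnetCifar10Model.
def staircase_lr(base_lr, global_step, batch_size,
                 num_examples_per_epoch=50000, num_epochs_per_decay=100,
                 decay_factor=0.1):
    decay_steps = num_epochs_per_decay * num_examples_per_epoch // batch_size
    return base_lr * decay_factor ** (global_step // decay_steps)

# With batch_size=128, decay_steps is 39062, so the rate stays at 0.1 until
# that step and then drops by a factor of 10.
print(staircase_lr(0.1, 0, 128))      # 0.1
print(staircase_lr(0.1, 39062, 128))  # drops to 0.01 at the first decay step
```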
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Densenet model configuration. + +References: + "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf +from models import model as model_lib + + +class DensenetCifar10Model(model_lib.CNNModel): + """Densenet cnn network configuration.""" + + def __init__(self, model, layer_counts, growth_rate, params=None): + self.growth_rate = growth_rate + super(DensenetCifar10Model, self).__init__( + model, 32, 64, 0.1, layer_counts=layer_counts, params=params) + self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} + + def dense_block(self, cnn, growth_rate): + input_layer = cnn.top_layer + c = cnn.batch_norm(input_layer, **self.batch_norm_config) + c = tf.nn.relu(c) + c = cnn.conv(growth_rate, 3, 3, 1, 1, stddev=np.sqrt(2.0/9/growth_rate), + activation=None, input_layer=c) + channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 + cnn.top_layer = tf.concat([input_layer, c], channel_index) + cnn.top_size += growth_rate + + def transition_layer(self, cnn): + in_size = cnn.top_size + cnn.batch_norm(**self.batch_norm_config) + cnn.top_layer = tf.nn.relu(cnn.top_layer) + cnn.conv(in_size, 1, 1, 1, 1, stddev=np.sqrt(2.0/9/in_size)) + cnn.apool(2, 2, 2, 2) + + def add_inference(self, cnn): + if self.layer_counts is None: + raise ValueError('Layer counts not specified for %s' % self.get_model()) + if self.growth_rate is None: + raise ValueError('Growth rate not specified for %s' % self.get_model()) + + cnn.conv(16, 3, 3, 1, 1, activation=None) + # Block 1 + for _ in xrange(self.layer_counts[0]): + self.dense_block(cnn, self.growth_rate) + self.transition_layer(cnn) + # Block 2 + for _ in xrange(self.layer_counts[1]): + self.dense_block(cnn, self.growth_rate) + self.transition_layer(cnn) + # Block 3 + for _ in xrange(self.layer_counts[2]): + self.dense_block(cnn, self.growth_rate) + cnn.batch_norm(**self.batch_norm_config) + cnn.top_layer = tf.nn.relu(cnn.top_layer) + channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 + cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index] + cnn.spatial_mean() + + def get_learning_rate(self, global_step, batch_size): + num_batches_per_epoch = 50000 // batch_size + boundaries = num_batches_per_epoch * np.array([150, 225, 300], + dtype=np.int64) + boundaries = [x for x in boundaries] + values = [0.1, 0.01, 0.001, 0.0001] + return tf.train.piecewise_constant(global_step, boundaries, values) + + +def create_densenet40_k12_model(): + return DensenetCifar10Model('densenet40_k12', (12, 12, 12), 12) + + +def create_densenet100_k12_model(): + return DensenetCifar10Model('densenet100_k12', (32, 32, 32), 12) + + +def create_densenet100_k24_model(): + return DensenetCifar10Model('densenet100_k24', (32, 32, 32), 24) diff --git 
a/cv/classification/resnet50/tensorflow/models/experimental/__init__.py b/cv/classification/resnet50/tensorflow/models/experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/models/experimental/deepspeech.py b/cv/classification/resnet50/tensorflow/models/experimental/deepspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..24e242f6db9d113a718194df3f9aca45a03da886 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/experimental/deepspeech.py @@ -0,0 +1,449 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""DeepSpeech2 model configuration. + +References: + https://arxiv.org/abs/1512.02595 + Deep Speech 2: End-to-End Speech Recognition in English and Mandarin +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf +import constants +from cnn_util import log_fn +from models import model as model_lib +from tensorflow.python.ops import variables # pylint: disable=g-direct-tensorflow-import + + +class DeepSpeechDecoder(object): + """Greedy decoder implementation for Deep Speech model.""" + + def __init__(self, labels, blank_index=28): + """Decoder initialization. + + Arguments: + labels: a string specifying the speech labels for the decoder to use. + blank_index: an integer specifying index for the blank character. Defaults + to 28. + """ + self.labels = labels + self.blank_index = blank_index + self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) + + def convert_to_string(self, sequence): + """Convert a sequence of indexes into corresponding string.""" + return ''.join([self.int_to_char[i] for i in sequence]) + + def wer(self, decode, target): + """Computes the Word Error Rate (WER). + + WER is defined as the edit distance between the two provided sentences after + tokenizing to words. + + Args: + decode: string of the decoded output. + target: a string for the ground truth label. + + Returns: + A float number for the WER of the current decode-target pair. + """ + try: + from nltk.metrics import distance # pylint: disable=g-import-not-at-top + except ImportError as e: + if 'nltk.metrics' not in e.message: + raise + raise ImportError('To use the experimental deepspeech model, you must ' + 'pip install -U nltk') + + # Map each word to a new char. 
+ words = set(decode.split() + target.split()) + word2char = dict(zip(words, range(len(words)))) + + new_decode = [chr(word2char[w]) for w in decode.split()] + new_target = [chr(word2char[w]) for w in target.split()] + + return distance.edit_distance(''.join(new_decode), ''.join(new_target)) + + def cer(self, decode, target): + """Computes the Character Error Rate (CER). + + CER is defined as the edit distance between the two given strings. + + Args: + decode: a string of the decoded output. + target: a string for the ground truth label. + + Returns: + A float number denoting the CER for the current sentence pair. + """ + try: + from nltk.metrics import distance # pylint: disable=g-import-not-at-top + except ImportError as e: + if 'nltk.metrics' not in e.message: + raise + raise ImportError('To use the experimental deepspeech model, you must ' + 'pip install -U nltk') + return distance.edit_distance(decode, target) + + def decode(self, char_indexes): + """Decode the best guess from logits using greedy algorithm.""" + # Merge repeated chars. + merge = [k for k, _ in itertools.groupby(char_indexes)] + # Remove the blank index in the decoded sequence. + merge_remove_blank = [] + for k in merge: + if k != self.blank_index: + merge_remove_blank.append(k) + + return self.convert_to_string(merge_remove_blank) + + def decode_logits(self, logits): + """Decode the best guess from logits using greedy algorithm.""" + # Choose the class with maximimum probability. + best = list(np.argmax(logits, axis=1)) + return self.decode(best) + + +class DeepSpeech2Model(model_lib.Model): + """Define DeepSpeech2 model.""" + + # Supported rnn cells. + SUPPORTED_RNNS = { + 'lstm': tf.nn.rnn_cell.BasicLSTMCell, + 'rnn': tf.nn.rnn_cell.RNNCell, + 'gru': tf.nn.rnn_cell.GRUCell, + } + + # Parameters for batch normalization. + BATCH_NORM_EPSILON = 1e-5 + BATCH_NORM_DECAY = 0.997 + + # Filters of convolution layer + CONV_FILTERS = 32 + + def __init__(self, + num_rnn_layers=5, + rnn_type='lstm', + is_bidirectional=True, + rnn_hidden_size=800, + use_bias=True, + params=None): + """Initialize DeepSpeech2 model. + + Args: + num_rnn_layers: an integer, the number of rnn layers (default: 5). + rnn_type: a string, one of the supported rnn cells: gru, rnn or lstm. + is_bidirectional: a boolean to indicate if the rnn layer is bidirectional. + rnn_hidden_size: an integer for the number of hidden units in the RNN + cell. + use_bias: a boolean specifying whether to use a bias in the last fc layer. + params: the params from BenchmarkCNN. + """ + super(DeepSpeech2Model, self).__init__( + 'deepspeech2', + batch_size=128, + learning_rate=0.0005, + fp16_loss_scale=128, + params=params) + self.num_rnn_layers = num_rnn_layers + self.rnn_type = rnn_type + self.is_bidirectional = is_bidirectional + self.rnn_hidden_size = rnn_hidden_size + self.use_bias = use_bias + self.num_feature_bins = 161 + self.max_time_steps = 3494 + self.max_label_length = 576 + + def _batch_norm(self, inputs, training): + """Batch normalization layer. + + Note that the momentum to use will affect validation accuracy over time. + Batch norm has different behaviors during training/evaluation. With a large + momentum, the model takes longer to get a near-accurate estimation of the + moving mean/variance over the entire training dataset, which means we need + more iterations to see good evaluation results. If the training data is + evenly distributed over the feature space, we can also try setting a smaller + momentum (such as 0.1) to get good evaluation result sooner. 
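The decoder above implements greedy CTC decoding (collapse repeated indices, then drop the blank) and computes WER by remapping whole words to single characters so an ordinary string edit distance can be reused. The sketch below mirrors both ideas with a toy label set; it uses a generic-sequence Levenshtein implementation directly instead of the word-to-character remapping and `nltk`, and it normalizes by the reference length for readability, whereas the class above returns the raw edit distance.

```python
import itertools

def greedy_ctc_decode(indices, blank_index):
    """Collapse repeated indices, then remove blanks (greedy CTC decode)."""
    merged = [k for k, _ in itertools.groupby(indices)]
    return [k for k in merged if k != blank_index]

def edit_distance(a, b):
    """Plain Levenshtein distance between two sequences."""
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]

def wer(decoded, target):
    """Word error rate, normalized by the number of reference words."""
    d, t = decoded.split(), target.split()
    return edit_distance(d, t) / max(len(t), 1)

# Toy example: labels 0..3 map to ' abc', index 4 is the CTC blank.
labels = ' abc'
decoded = ''.join(labels[i] for i in greedy_ctc_decode([1, 1, 4, 2, 4, 4, 3], 4))
print(decoded)                              # 'abc'
print(wer('the cat sat', 'the cat sat down'))  # 0.25 (one insertion over four words)
```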
+ + Args: + inputs: input data for batch norm layer. + training: a boolean to indicate if it is in training stage. + + Returns: + tensor output from batch norm layer. + """ + return tf.layers.batch_normalization( + inputs=inputs, + momentum=DeepSpeech2Model.BATCH_NORM_DECAY, + epsilon=DeepSpeech2Model.BATCH_NORM_EPSILON, + fused=True, + training=training) + + def _conv_bn_layer(self, inputs, padding, filters, kernel_size, strides, + layer_id, training): + """Defines 2D convolutional + batch normalization layer. + + Args: + inputs: input data for convolution layer. + padding: padding to be applied before convolution layer. + filters: an integer, number of output filters in the convolution. + kernel_size: a tuple specifying the height and width of the 2D convolution + window. + strides: a tuple specifying the stride length of the convolution. + layer_id: an integer specifying the layer index. + training: a boolean to indicate which stage we are in (training/eval). + + Returns: + tensor output from the current layer. + """ + # Perform symmetric padding on the feature dimension of time_step + # This step is required to avoid issues when RNN output sequence is shorter + # than the label length. + inputs = tf.pad( + inputs, + [[0, 0], [padding[0], padding[0]], [padding[1], padding[1]], [0, 0]]) + inputs = tf.layers.conv2d( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding='valid', + use_bias=False, + activation=tf.nn.relu6, + name='cnn_{}'.format(layer_id)) + return self._batch_norm(inputs, training) + + def _rnn_layer(self, inputs, rnn_cell, rnn_hidden_size, layer_id, + use_batch_norm, is_bidirectional, training): + """Defines a batch normalization + rnn layer. + + Args: + inputs: input tensors for the current layer. + rnn_cell: RNN cell instance to use. + rnn_hidden_size: an integer for the dimensionality of the rnn output + space. + layer_id: an integer for the index of current layer. + use_batch_norm: a boolean specifying whether to perform batch + normalization on input states. + is_bidirectional: a boolean specifying whether the rnn layer is + bi-directional. + training: a boolean to indicate which stage we are in (training/eval). + + Returns: + tensor output for the current layer. + """ + if use_batch_norm: + inputs = self._batch_norm(inputs, training) + + # Construct forward/backward RNN cells. + fw_cell = rnn_cell( + num_units=rnn_hidden_size, name='rnn_fw_{}'.format(layer_id)) + + if is_bidirectional: + bw_cell = rnn_cell( + num_units=rnn_hidden_size, name='rnn_bw_{}'.format(layer_id)) + outputs, _ = tf.nn.bidirectional_dynamic_rnn( + cell_fw=fw_cell, + cell_bw=bw_cell, + inputs=inputs, + dtype=tf.float32, + swap_memory=True) + rnn_outputs = tf.concat(outputs, -1) + else: + rnn_outputs = tf.nn.dynamic_rnn( + fw_cell, inputs, dtype=tf.float32, swap_memory=True) + + return rnn_outputs + + def get_input_data_types(self, subset): + """Returns the list of data types of the inputs.""" + del subset # Same data types for both train and validation subsets. 
+ return [self.data_type, tf.int32, tf.int32, tf.int32] + + def get_input_shapes(self, subset): + """Returns the list of shapes of the padded inputs.""" + del subset # Same shapes for both train and validation subsets + return [ + [self.batch_size, self.max_time_steps, self.num_feature_bins, 1], + [self.batch_size, self.max_label_length], + [self.batch_size, 1], + [self.batch_size, 1], + ] + + def get_synthetic_inputs(self, input_name, nclass): + inputs = tf.random_uniform(self.get_input_shapes('train')[0], + dtype=self.get_input_data_types('train')[0]) + inputs = variables.VariableV1(inputs, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name=input_name) + labels = tf.convert_to_tensor( + np.random.randint(28, size=[self.batch_size, self.max_label_length])) + input_lengths = tf.convert_to_tensor( + [self.max_time_steps] * self.batch_size) + label_lengths = tf.convert_to_tensor( + [self.max_label_length] * self.batch_size) + return [inputs, labels, input_lengths, label_lengths] + + # TODO(laigd): support fp16. + # TODO(laigd): support multiple gpus. + def build_network(self, inputs, phase_train=True, nclass=29): + """Builds the forward pass of the deepspeech2 model. + + Args: + inputs: The input list of the model. + phase_train: True during training. False during evaluation. + nclass: Number of classes that the input spectrogram can belong to. + + Returns: + A BuildNetworkResult which contains the logits and model-specific extra + information. + """ + inputs = inputs[0] # Get the spectrogram feature. + + # Two cnn layers. + inputs = self._conv_bn_layer( + inputs, + padding=(20, 5), + filters=DeepSpeech2Model.CONV_FILTERS, + kernel_size=(41, 11), + strides=(2, 2), + layer_id=1, + training=phase_train) + + inputs = self._conv_bn_layer( + inputs, + padding=(10, 5), + filters=DeepSpeech2Model.CONV_FILTERS, + kernel_size=(21, 11), + strides=(2, 1), + layer_id=2, + training=phase_train) + + # output of conv_layer2 with the shape of + # [batch_size (N), times (T), features (F), channels (C)]. + # Convert the conv output to rnn input. + + # batch_size = tf.shape(inputs)[0] + feat_size = inputs.get_shape().as_list()[2] + inputs = tf.reshape( + inputs, + [self.batch_size, -1, feat_size * DeepSpeech2Model.CONV_FILTERS]) + + # RNN layers. + rnn_cell = DeepSpeech2Model.SUPPORTED_RNNS[self.rnn_type] + for layer_counter in xrange(self.num_rnn_layers): + # No batch normalization on the first layer. + use_batch_norm = (layer_counter != 0) + inputs = self._rnn_layer(inputs, rnn_cell, self.rnn_hidden_size, + layer_counter + 1, use_batch_norm, + self.is_bidirectional, phase_train) + + # FC layer with batch norm. + inputs = self._batch_norm(inputs, phase_train) + logits = tf.layers.dense(inputs, nclass, use_bias=self.use_bias) + + return model_lib.BuildNetworkResult(logits=logits, extra_info=None) + + def loss_function(self, inputs, build_network_result): + """Computes the ctc loss for the current batch of predictions. + + Args: + inputs: the input list of the model. + build_network_result: a BuildNetworkResult returned by build_network(). + + Returns: + The loss tensor of the model. 
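`build_network` above pads the spectrogram symmetrically and then applies 'VALID' convolutions, so the surviving size on each axis follows the usual formula `(size + 2*pad - kernel) // stride + 1`. A small helper makes it easy to sanity-check the shapes, using the defaults quoted above (3494 time steps, 161 feature bins, kernels (41, 11) and (21, 11), time strides 2 and 2, feature strides 2 and 1, 32 filters):

```python
# Output size of a 'VALID' conv applied after symmetric padding, as in
# _conv_bn_layer above: pad both sides by `pad`, then slide a `kernel`-wide
# window with the given `stride`.
def conv_out_len(size, pad, kernel, stride):
    return (size + 2 * pad - kernel) // stride + 1

# Time axis (3494 padded frames):
t = conv_out_len(3494, pad=20, kernel=41, stride=2)  # layer 1 -> 1747
t = conv_out_len(t, pad=10, kernel=21, stride=2)     # layer 2 -> 874

# Feature axis (161 bins):
f = conv_out_len(161, pad=5, kernel=11, stride=2)    # layer 1 -> 81
f = conv_out_len(f, pad=5, kernel=11, stride=1)      # layer 2 -> 81

print(t, f * 32)  # 874 time steps, 81 * 32 = 2592 features fed to the first RNN layer
```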
+ """ + logits = build_network_result.logits + actual_time_steps = inputs[2] + probs = tf.nn.softmax(logits) + ctc_time_steps = tf.shape(probs)[1] + ctc_input_length = tf.to_float( + tf.multiply(actual_time_steps, ctc_time_steps)) + ctc_input_length = tf.to_int32( + tf.floordiv(ctc_input_length, tf.to_float(self.max_time_steps))) + + label_length = inputs[3] + label_length = tf.to_int32(tf.squeeze(label_length)) + ctc_input_length = tf.to_int32(tf.squeeze(ctc_input_length)) + + labels = inputs[1] + sparse_labels = tf.to_int32( + tf.keras.backend.ctc_label_dense_to_sparse(labels, label_length)) + y_pred = tf.log( + tf.transpose(probs, perm=[1, 0, 2]) + tf.keras.backend.epsilon()) + + losses = tf.expand_dims( + tf.nn.ctc_loss( + labels=sparse_labels, + inputs=y_pred, + sequence_length=ctc_input_length, + ignore_longer_outputs_than_inputs=True), + axis=1) + loss = tf.reduce_mean(losses) + return loss + + PROBABILITY_TENSOR = 'deepspeech2_prob' + LABEL_TENSOR = 'deepspeech2_label' + + def accuracy_function(self, inputs, logits): + """Returns the ops to evaluate the model performance.""" + # Get probabilities of each predicted class + probs = tf.nn.softmax(logits) + assert probs.shape.as_list()[0] == self.batch_size + return { + (constants.UNREDUCED_ACCURACY_OP_PREFIX + self.PROBABILITY_TENSOR): + probs, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + self.LABEL_TENSOR): + inputs[1], + } + + def postprocess(self, results): + """Postprocess results returned from model in Python.""" + probs = results[self.PROBABILITY_TENSOR] + + total_wer, total_cer = 0, 0 + speech_labels = " abcdefghijklmnopqrstuvwxyz'-" + greedy_decoder = DeepSpeechDecoder(speech_labels) + + # Evaluate the performance using WER (Word Error Rate) and CER (Character + # Error Rate) as metrics. + targets = results[self.LABEL_TENSOR] # The ground truth transcript + for i in range(self.batch_size): + # Decode string. + predicted_str = greedy_decoder.decode_logits(probs[i]) + expected_str = greedy_decoder.decode(targets[i]) + # Compute CER. + total_cer += (greedy_decoder.cer(predicted_str, expected_str) / + len(expected_str)) + # Compute WER. + total_wer += (greedy_decoder.wer(predicted_str, expected_str) / + len(expected_str.split())) + + # Get mean value + total_cer /= self.batch_size + total_wer /= self.batch_size + + log_fn('total CER: {:f}; total WER: {:f}; total example: {:d}.'.format( + total_cer, total_wer, self.batch_size)) + # TODO(laigd): get rid of top_N_accuracy bindings in benchmark_cnn.py + return {'top_1_accuracy': 0., 'top_5_accuracy': 0.} diff --git a/cv/classification/resnet50/tensorflow/models/experimental/official_ncf_model.py b/cv/classification/resnet50/tensorflow/models/experimental/official_ncf_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6ca513f9f0c3f9b7c67bc7a072ed0b35fd4f5a --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/experimental/official_ncf_model.py @@ -0,0 +1,172 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
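The CTC loss above rescales each utterance's true frame count into the CTC time base: the conv stack shrinks the time axis, so the per-example length becomes roughly `actual_steps * ctc_time_steps / max_time_steps`, floored to an integer. The postprocess step then normalizes CER by the number of reference characters and WER by the number of reference words. A plain-Python sketch of the length rescaling:

```python
# Rescale an utterance's original frame count into the CTC time base, as
# loss_function above does with tf.multiply / tf.floordiv / tf.to_int32.
def ctc_input_length(actual_steps, ctc_time_steps, max_time_steps):
    return int(actual_steps * ctc_time_steps) // max_time_steps

# E.g. an utterance that filled 1000 of the 3494 padded frames, with the conv
# stack emitting 874 CTC time steps, contributes 250 valid CTC steps.
print(ctc_input_length(1000, 874, 3494))  # 250
```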
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrap the official recommendation model in a tf_cnn_benchmarks Model. + +This allows the recommendation NCF model to be used in tf_cnn_benchmarks. +Currently, the implementation is fairly hacky, because tf_cnn_benchmarks is +intended to be used only with CNNs. + +Only synthetic data with 1 GPU is currently supported. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +from models import model + + +# Obtained by running the official NCF model with the following command: +# python ncf_main.py --dataset ml-20m +# and printing the number of users and items here: +# https://github.com/tensorflow/models/blob/d089975f630a8a01be63e45ef08a31be14bb96b4/official/recommendation/data_preprocessing.py#L68 +_NUM_USERS_20M = 138493 +_NUM_ITEMS_20M = 26744 + + +# TODO(reedwm): Support multi-GPU. Currently keras layers, which this model +# uses, ignore variable_scopes, which we rely on for multi-GPU support. +# TODO(reedwm): Support real data. This will require a significant refactor. +# TODO(reedwm): All-reduce IndexedSlices more effectively. +# TODO(reedwm): Support the 1M variant of this model. + + +class NcfModel(model.Model): + r"""A model.Model wrapper around the official NCF recommendation model. + + To do an NCF run with synthetic data that roughly matches what the official + model does, run: + + python tf_cnn_benchmarks.py --optimizer=adam --model=ncf --batch_size=65536 \ + --weight_decay=0 --sparse_to_dense_grads + """ + + def __init__(self, params=None): + super(NcfModel, self).__init__( + 'official_ncf', batch_size=2048, learning_rate=0.0005, + fp16_loss_scale=128, params=params) + if self.fp16_vars: + raise ValueError('NCF model only supports float32 variables for now.') + + def build_network(self, inputs, phase_train=True, nclass=1001): + try: + from official.recommendation import neumf_model # pylint: disable=g-import-not-at-top + except ImportError as e: + if 'neumf_model' not in e.message: + raise + raise ImportError('To use the experimental NCF model, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models to the PYTHONPATH.') + del nclass + + users, items, _ = inputs + params = { + 'num_users': _NUM_USERS_20M, + 'num_items': _NUM_ITEMS_20M, + 'model_layers': (256, 256, 128, 64), + 'mf_dim': 64, + 'mf_regularization': 0, + 'mlp_reg_layers': (0, 0, 0, 0), + 'use_tpu': False + } + user_input = tf.keras.layers.Input(tensor=users, name='user_input') + item_input = tf.keras.layers.Input(tensor=items, name='item_input') + if self.data_type == tf.float32: + keras_model = neumf_model.construct_model(user_input, item_input, params) + logits = keras_model.output + else: + assert self.data_type == tf.float16 + old_floatx = tf.keras.backend.floatx() + try: + tf.keras.backend.set_floatx('float16') + # We cannot rely on the variable_scope's fp16 custom getter here, + # because the NCF model uses keras layers, which ignore variable scopes. + # So we use a variable_creator_scope instead. 
+ with tf.variable_creator_scope(_fp16_variable_creator): + keras_model = neumf_model.construct_model(user_input, item_input, + params) + logits = tf.cast(keras_model.output, tf.float32) + finally: + tf.keras.backend.set_floatx(old_floatx) + return model.BuildNetworkResult(logits=logits, extra_info=None) + + def loss_function(self, inputs, build_network_result): + logits = build_network_result.logits + + # Softmax with the first column of ones is equivalent to sigmoid. + # TODO(reedwm): Actually, the first column should be zeros to be equivalent + # to sigmoid. But, we keep it at ones to match the official models. + logits = tf.concat([tf.ones(logits.shape, dtype=logits.dtype), logits], + axis=1) + + return tf.losses.sparse_softmax_cross_entropy( + labels=inputs[2], + logits=logits + ) + + def get_synthetic_inputs(self, input_name, nclass): + """Returns the ops to generate synthetic inputs and labels.""" + def users_init_val(): + return tf.random_uniform((self.batch_size, 1), minval=0, + maxval=_NUM_USERS_20M, dtype=tf.int32) + users = tf.Variable(users_init_val, dtype=tf.int32, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name='synthetic_users') + def items_init_val(): + return tf.random_uniform((self.batch_size, 1), minval=0, + maxval=_NUM_ITEMS_20M, dtype=tf.int32) + items = tf.Variable(items_init_val, dtype=tf.int32, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name='synthetic_items') + + def labels_init_val(): + return tf.random_uniform((self.batch_size,), minval=0, maxval=2, + dtype=tf.int32) + labels = tf.Variable(labels_init_val, dtype=tf.int32, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name='synthetic_labels') + + return [users, items, labels] + + def get_input_shapes(self, subset): + del subset + return [[self.batch_size, 1], [self.batch_size, 1], [self.batch_size]] + + def get_input_data_types(self, subset): + del subset + return [self.int32, tf.int32, tf.int32] + + +def _fp16_variable_creator(next_creator, **kwargs): + """Variable creator to create variables in fp32 and cast them to fp16.""" + dtype = kwargs.get('dtype', None) + initial_value = kwargs.get('initial_value', None) + if dtype is None: + if initial_value is not None and not callable(initial_value): + dtype = initial_value.dtype + if dtype == tf.float16: + if callable(initial_value): + new_initial_value = lambda: tf.cast(initial_value(), tf.float32) + else: + new_initial_value = tf.cast(initial_value, tf.float32) + kwargs['dtype'] = tf.float32 + kwargs['initial_value'] = new_initial_value + var = next_creator(**kwargs) + return tf.cast(var, dtype=tf.float16) + else: + return next_creator(**kwargs) + diff --git a/cv/classification/resnet50/tensorflow/models/googlenet_model.py b/cv/classification/resnet50/tensorflow/models/googlenet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3505594ec933cc05cb96b00eeac81cbc4334693c --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/googlenet_model.py @@ -0,0 +1,63 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
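The NCF loss above turns a single logit column into a two-class softmax by prepending a column of ones. As the TODO notes, a column of zeros would reproduce the sigmoid exactly; with ones, the result is a sigmoid shifted by one. A quick numpy check of both claims (numpy is used purely for illustration here):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def two_class_softmax_p1(first_col, logit):
    """P(class 1) of a softmax over [first_col, logit]."""
    z = np.array([first_col, logit], dtype=np.float64)
    e = np.exp(z - z.max())
    return e[1] / e.sum()

x = 0.7
print(two_class_softmax_p1(0.0, x), sigmoid(x))        # zeros column == sigmoid(x)
print(two_class_softmax_p1(1.0, x), sigmoid(x - 1.0))  # ones column == sigmoid(x - 1)
```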
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Googlenet model configuration. + +References: + Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich + Going deeper with convolutions + arXiv preprint arXiv:1409.4842 (2014) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from models import model + + +class GooglenetModel(model.CNNModel): + """GoogLeNet.""" + + def __init__(self, params=None): + super(GooglenetModel, self).__init__( + 'googlenet', 224, 32, 0.005, params=params) + + def add_inference(self, cnn): + + def inception_v1(cnn, k, l, m, n, p, q): + cols = [[('conv', k, 1, 1)], [('conv', l, 1, 1), ('conv', m, 3, 3)], + [('conv', n, 1, 1), ('conv', p, 5, 5)], + [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)]] + cnn.inception_module('incept_v1', cols) + + cnn.conv(64, 7, 7, 2, 2) + cnn.mpool(3, 3, 2, 2, mode='SAME') + cnn.conv(64, 1, 1) + cnn.conv(192, 3, 3) + cnn.mpool(3, 3, 2, 2, mode='SAME') + inception_v1(cnn, 64, 96, 128, 16, 32, 32) + inception_v1(cnn, 128, 128, 192, 32, 96, 64) + cnn.mpool(3, 3, 2, 2, mode='SAME') + inception_v1(cnn, 192, 96, 208, 16, 48, 64) + inception_v1(cnn, 160, 112, 224, 24, 64, 64) + inception_v1(cnn, 128, 128, 256, 24, 64, 64) + inception_v1(cnn, 112, 144, 288, 32, 64, 64) + inception_v1(cnn, 256, 160, 320, 32, 128, 128) + cnn.mpool(3, 3, 2, 2, mode='SAME') + inception_v1(cnn, 256, 160, 320, 32, 128, 128) + inception_v1(cnn, 384, 192, 384, 48, 128, 128) + cnn.apool(7, 7, 1, 1, mode='VALID') + cnn.reshape([-1, 1024]) diff --git a/cv/classification/resnet50/tensorflow/models/inception_model.py b/cv/classification/resnet50/tensorflow/models/inception_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8835edb88cb57fde2b67bc8cb5fb2caffa0527f --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/inception_model.py @@ -0,0 +1,213 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Inception model configuration. + +Includes multiple models: inception3, inception4, inception-resnet2. 
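Each `inception_v1` module above concatenates four parallel branches, so its output width is `k + m + p + q` (the 1x1 branch, the 3x3 branch, the 5x5 branch, and the pooled 1x1 branch). A tiny helper confirms the 1024 channels the final `cnn.reshape([-1, 1024])` expects:

```python
# Output channels of the inception_v1 module defined above: the four branches
# end in convolutions of width k, m, p and q and are concatenated.
def inception_v1_channels(k, l, m, n, p, q):
    del l, n  # widths of the intermediate 1x1 reductions; they do not appear in the output
    return k + m + p + q

print(inception_v1_channels(64, 96, 128, 16, 32, 32))      # 256 (first module)
print(inception_v1_channels(384, 192, 384, 48, 128, 128))  # 1024 (last module, matches the reshape)
```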
+ +References: + Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi + Inception-v4, Inception-ResNet and the Impact of Residual Connections on + Learning + + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich + Going Deeper with Convolutions + http://arxiv.org/pdf/1409.4842v1.pdf + + Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, + Zbigniew Wojna + Rethinking the Inception Architecture for Computer Vision + arXiv preprint arXiv:1512.00567 (2015) + + Inception v3 model: http://arxiv.org/abs/1512.00567 + + Inception v4 and Resnet V2 architectures: http://arxiv.org/abs/1602.07261 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import xrange # pylint: disable=redefined-builtin +from models import model + + +class Inceptionv3Model(model.CNNModel): + """InceptionV3.""" + + def __init__(self, auxiliary=False, params=None): + self._auxiliary = auxiliary + super(Inceptionv3Model, self).__init__( + 'inception3', 299, 32, 0.005, params=params) + + def add_inference(self, cnn): + def inception_v3_a(cnn, n): + cols = [[('conv', 64, 1, 1)], [('conv', 48, 1, 1), ('conv', 64, 5, 5)], + [('conv', 64, 1, 1), ('conv', 96, 3, 3), ('conv', 96, 3, 3)], + [('apool', 3, 3, 1, 1, 'SAME'), ('conv', n, 1, 1)]] + cnn.inception_module('incept_v3_a', cols) + + def inception_v3_b(cnn): + cols = [[('conv', 384, 3, 3, 2, 2, 'VALID')], + [('conv', 64, 1, 1), + ('conv', 96, 3, 3), + ('conv', 96, 3, 3, 2, 2, 'VALID')], + [('mpool', 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v3_b', cols) + + def inception_v3_c(cnn, n): + cols = [[('conv', 192, 1, 1)], + [('conv', n, 1, 1), ('conv', n, 1, 7), ('conv', 192, 7, 1)], + [('conv', n, 1, 1), ('conv', n, 7, 1), ('conv', n, 1, 7), + ('conv', n, 7, 1), ('conv', 192, 1, 7)], + [('apool', 3, 3, 1, 1, 'SAME'), ('conv', 192, 1, 1)]] + cnn.inception_module('incept_v3_c', cols) + + def inception_v3_d(cnn): + cols = [[('conv', 192, 1, 1), ('conv', 320, 3, 3, 2, 2, 'VALID')], + [('conv', 192, 1, 1), ('conv', 192, 1, 7), ('conv', 192, 7, 1), + ('conv', 192, 3, 3, 2, 2, 'VALID')], + [('mpool', 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v3_d', cols) + + def inception_v3_e(cnn, pooltype): + cols = [[('conv', 320, 1, 1)], [('conv', 384, 1, 1), ('conv', 384, 1, 3)], + [('share',), ('conv', 384, 3, 1)], + [('conv', 448, 1, 1), ('conv', 384, 3, 3), ('conv', 384, 1, 3)], + [('share',), ('share',), ('conv', 384, 3, 1)], + [('mpool' if pooltype == 'max' else 'apool', 3, 3, 1, 1, 'SAME'), + ('conv', 192, 1, 1)]] + cnn.inception_module('incept_v3_e', cols) + + def incept_v3_aux(cnn): + assert cnn.aux_top_layer is None + cnn.aux_top_layer = cnn.top_layer + cnn.aux_top_size = cnn.top_size + with cnn.switch_to_aux_top_layer(): + cnn.apool(5, 5, 3, 3, mode='VALID') + cnn.conv(128, 1, 1, mode='SAME') + cnn.conv(768, 5, 5, mode='VALID', stddev=0.01) + cnn.reshape([-1, 768]) + + cnn.use_batch_norm = True + cnn.conv(32, 3, 3, 2, 2, mode='VALID') # 299 x 299 x 3 + cnn.conv(32, 3, 3, 1, 1, mode='VALID') # 149 x 149 x 32 + cnn.conv(64, 3, 3, 1, 1, mode='SAME') # 147 x 147 x 64 + cnn.mpool(3, 3, 2, 2, mode='VALID') # 147 x 147 x 64 + cnn.conv(80, 1, 1, 1, 1, mode='VALID') # 73 x 73 x 80 + cnn.conv(192, 3, 3, 1, 1, mode='VALID') # 71 x 71 x 192 + cnn.mpool(3, 3, 2, 2, 'VALID') # 35 x 35 x 192 + inception_v3_a(cnn, 32) # 35 x 35 x 256 mixed. 
+ inception_v3_a(cnn, 64) # 35 x 35 x 288 mixed_1. + inception_v3_a(cnn, 64) # 35 x 35 x 288 mixed_2 + inception_v3_b(cnn) # 17 x 17 x 768 mixed_3 + inception_v3_c(cnn, 128) # 17 x 17 x 768 mixed_4 + inception_v3_c(cnn, 160) # 17 x 17 x 768 mixed_5 + inception_v3_c(cnn, 160) # 17 x 17 x 768 mixed_6 + inception_v3_c(cnn, 192) # 17 x 17 x 768 mixed_7 + if self._auxiliary: + incept_v3_aux(cnn) # Auxillary Head logits + inception_v3_d(cnn) # 17 x 17 x 1280 mixed_8 + inception_v3_e(cnn, 'avg') # 8 x 8 x 2048 mixed_9 + inception_v3_e(cnn, 'max') # 8 x 8 x 2048 mixed_10 + cnn.apool(8, 8, 1, 1, 'VALID') # 8 x 8 x 2048 + cnn.reshape([-1, 2048]) # 1 x 1 x 2048 + + +# Stem functions +def inception_v4_sa(cnn): + cols = [[('mpool', 3, 3, 2, 2, 'VALID')], [('conv', 96, 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v4_sa', cols) + + +def inception_v4_sb(cnn): + cols = [[('conv', 64, 1, 1), ('conv', 96, 3, 3, 1, 1, 'VALID')], + [('conv', 64, 1, 1), ('conv', 64, 7, 1), ('conv', 64, 1, 7), + ('conv', 96, 3, 3, 1, 1, 'VALID')]] + cnn.inception_module('incept_v4_sb', cols) + + +def inception_v4_sc(cnn): + cols = [[('conv', 192, 3, 3, 2, 2, 'VALID')], + [('mpool', 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v4_sc', cols) + + +# Reduction functions +def inception_v4_ra(cnn, k, l, m, n): + cols = [ + [('mpool', 3, 3, 2, 2, 'VALID')], [('conv', n, 3, 3, 2, 2, 'VALID')], + [('conv', k, 1, 1), ('conv', l, 3, 3), ('conv', m, 3, 3, 2, 2, 'VALID')] + ] + cnn.inception_module('incept_v4_ra', cols) + + +def inception_v4_rb(cnn): + cols = [[('mpool', 3, 3, 2, 2, 'VALID')], + [('conv', 192, 1, 1), ('conv', 192, 3, 3, 2, 2, 'VALID')], + [('conv', 256, 1, 1), ('conv', 256, 1, 7), ('conv', 320, 7, 1), + ('conv', 320, 3, 3, 2, 2, 'VALID')]] + cnn.inception_module('incept_v4_rb', cols) + + +class Inceptionv4Model(model.CNNModel): + """Inceptionv4.""" + + def __init__(self, params=None): + super(Inceptionv4Model, self).__init__( + 'inception4', 299, 32, 0.005, params=params) + + def add_inference(self, cnn): + def inception_v4_a(cnn): + cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 96, 1, 1)], + [('conv', 96, 1, 1)], [('conv', 64, 1, 1), ('conv', 96, 3, 3)], + [('conv', 64, 1, 1), ('conv', 96, 3, 3), ('conv', 96, 3, 3)]] + cnn.inception_module('incept_v4_a', cols) + + def inception_v4_b(cnn): + cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 128, 1, 1)], + [('conv', 384, 1, 1)], + [('conv', 192, 1, 1), ('conv', 224, 1, 7), ('conv', 256, 7, 1)], + [('conv', 192, 1, 1), ('conv', 192, 1, 7), ('conv', 224, 7, 1), + ('conv', 224, 1, 7), ('conv', 256, 7, 1)]] + cnn.inception_module('incept_v4_b', cols) + + def inception_v4_c(cnn): + cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 256, 1, 1)], + [('conv', 256, 1, 1)], [('conv', 384, 1, 1), ('conv', 256, 1, 3)], + [('share',), ('conv', 256, 3, 1)], + [('conv', 384, 1, 1), ('conv', 448, 1, 3), ('conv', 512, 3, 1), + ('conv', 256, 3, 1)], [('share',), ('share',), ('share',), + ('conv', 256, 1, 3)]] + cnn.inception_module('incept_v4_c', cols) + + cnn.use_batch_norm = True + cnn.conv(32, 3, 3, 2, 2, mode='VALID') + cnn.conv(32, 3, 3, 1, 1, mode='VALID') + cnn.conv(64, 3, 3) + inception_v4_sa(cnn) + inception_v4_sb(cnn) + inception_v4_sc(cnn) + for _ in xrange(4): + inception_v4_a(cnn) + inception_v4_ra(cnn, 192, 224, 256, 384) + for _ in xrange(7): + inception_v4_b(cnn) + inception_v4_rb(cnn) + for _ in xrange(3): + inception_v4_c(cnn) + cnn.spatial_mean() + cnn.dropout(0.8) diff --git a/cv/classification/resnet50/tensorflow/models/lenet_model.py 
b/cv/classification/resnet50/tensorflow/models/lenet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0218daaeb2b016b7bfcc886af813e92aee25f521 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/lenet_model.py @@ -0,0 +1,44 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Lenet model configuration. + +References: + LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner + Gradient-based learning applied to document recognition + Proceedings of the IEEE (1998) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from models import model + + +class Lenet5Model(model.CNNModel): + """Lenet5.""" + + def __init__(self, params=None): + super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005, params=params) + + def add_inference(self, cnn): + # Note: This matches TF's MNIST tutorial model + cnn.conv(32, 5, 5) + cnn.mpool(2, 2) + cnn.conv(64, 5, 5) + cnn.mpool(2, 2) + cnn.reshape([-1, 64 * 7 * 7]) + cnn.affine(512) diff --git a/cv/classification/resnet50/tensorflow/models/model.py b/cv/classification/resnet50/tensorflow/models/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3db13081917f9582704428c6c26956cbd652ae77 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/model.py @@ -0,0 +1,340 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base model configuration for CNN benchmarks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple + +import tensorflow.compat.v1 as tf + +import convnet_builder +import mlperf +from tensorflow.python.ops import variables as variables_module # pylint: disable=g-direct-tensorflow-import + +# BuildNetworkResult encapsulate the result (e.g. logits) of a +# Model.build_network() call. 
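The `cnn.reshape([-1, 64 * 7 * 7])` in the LeNet-5 definition above follows from two 2x2 max-pools applied to 28x28 MNIST inputs, assuming the builder's default 'SAME' convolutions (so only the pools change the spatial size): 28 -> 14 -> 7, with 64 channels after the second conv. A one-liner sanity check:

```python
# Flattened size after LeNet-5's conv/pool stack on 28x28 inputs, assuming
# 'SAME' convolutions so the spatial size only shrinks at the two pools.
size = 28
for _ in range(2):       # two 2x2 max-pools with stride 2
    size //= 2
print(64 * size * size)  # 3136 == 64 * 7 * 7, the reshape target above
```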
+BuildNetworkResult = namedtuple( + 'BuildNetworkResult', + [ + 'logits', # logits of the network + 'extra_info', # Model specific extra information + ]) + + +class Model(object): + """Base model config for DNN benchmarks.""" + + def __init__(self, + model_name, + batch_size, + learning_rate, + fp16_loss_scale, + params=None): + self.model_name = model_name + self.batch_size = batch_size + self.default_batch_size = batch_size + self.learning_rate = learning_rate + # TODO(reedwm) Set custom loss scales for each model instead of using the + # default of 128. + self.fp16_loss_scale = fp16_loss_scale + + # use_tf_layers specifies whether to build the model using tf.layers. + # fp16_vars specifies whether to create the variables in float16. + if params: + self.use_tf_layers = params.use_tf_layers + self.fp16_vars = params.fp16_vars + self.data_type = tf.float16 if params.use_fp16 else tf.float32 + else: + self.use_tf_layers = True + self.fp16_vars = False + self.data_type = tf.float32 + + def get_model_name(self): + return self.model_name + + def get_batch_size(self): + return self.batch_size + + def set_batch_size(self, batch_size): + self.batch_size = batch_size + + def get_default_batch_size(self): + return self.default_batch_size + + def get_fp16_loss_scale(self): + return self.fp16_loss_scale + + def filter_l2_loss_vars(self, variables): + """Filters out variables that the L2 loss should not be computed for. + + By default, this filters out batch normalization variables and keeps all + other variables. This behavior can be overridden by subclasses. + + Args: + variables: A list of the trainable variables. + + Returns: + A list of variables that the L2 loss should be computed for. + """ + mlperf.logger.log(key=mlperf.tags.MODEL_EXCLUDE_BN_FROM_L2, + value=True) + return [v for v in variables if 'batchnorm' not in v.name] + + def get_learning_rate(self, global_step, batch_size): + del global_step + del batch_size + return self.learning_rate + + def get_input_shapes(self, subset): + """Returns the list of expected shapes of all the inputs to this model.""" + del subset + raise NotImplementedError('Must be implemented in derived classes') + + def get_input_data_types(self, subset): + """Returns the list of data types of all the inputs to this model.""" + del subset + raise NotImplementedError('Must be implemented in derived classes') + + def get_synthetic_inputs(self, input_name, nclass): + """Returns the ops to generate synthetic inputs.""" + raise NotImplementedError('Must be implemented in derived classes') + + def build_network(self, inputs, phase_train, nclass): + """Builds the forward pass of the model. + + Args: + inputs: The list of inputs, including labels + phase_train: True during training. False during evaluation. + nclass: Number of classes that the inputs can belong to. + + Returns: + A BuildNetworkResult which contains the logits and model-specific extra + information. + """ + raise NotImplementedError('Must be implemented in derived classes') + + def loss_function(self, inputs, build_network_result): + """Returns the op to measure the loss of the model. + + Args: + inputs: the input list of the model. + build_network_result: a BuildNetworkResult returned by build_network(). + + Returns: + The loss tensor of the model. + """ + raise NotImplementedError('Must be implemented in derived classes') + + # TODO(laigd): have accuracy_function() take build_network_result instead. 
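`filter_l2_loss_vars` above keeps a variable in the weight-decay term only if its name does not contain 'batchnorm'. A tiny sketch of that name-based filter, with plain strings standing in for `tf.Variable` names (the names themselves are made up for illustration):

```python
# Name-based filtering as in filter_l2_loss_vars: batch-norm parameters are
# excluded from the L2 / weight-decay term.
trainable_names = [
    'cg/conv0/conv2d/kernel:0',
    'cg/conv0/batchnorm0/gamma:0',
    'cg/conv0/batchnorm0/beta:0',
    'cg/affine0/weights:0',
]
l2_names = [n for n in trainable_names if 'batchnorm' not in n]
print(l2_names)  # the two batch-norm variables are dropped
```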
+ def accuracy_function(self, inputs, logits): + """Returns the ops to measure the accuracy of the model.""" + raise NotImplementedError('Must be implemented in derived classes') + + def postprocess(self, results): + """Postprocess results returned from model in Python.""" + return results + + def reached_target(self): + """Define custom methods to stop training when model's target is reached.""" + return False + + +class CNNModel(Model): + """Base model configuration for CNN benchmarks.""" + + # TODO(laigd): reduce the number of parameters and read everything from + # params. + def __init__(self, + model, + image_size, + batch_size, + learning_rate, + layer_counts=None, + fp16_loss_scale=128, + params=None): + super(CNNModel, self).__init__( + model, batch_size, learning_rate, fp16_loss_scale, + params=params) + self.image_size = image_size + self.layer_counts = layer_counts + self.depth = 3 + self.params = params + self.data_format = params.data_format if params else 'NCHW' + + def get_layer_counts(self): + return self.layer_counts + + def skip_final_affine_layer(self): + """Returns if the caller of this class should skip the final affine layer. + + Normally, this class adds a final affine layer to the model after calling + self.add_inference(), to generate the logits. If a subclass override this + method to return True, the caller should not add the final affine layer. + + This is useful for tests. + """ + return False + + def add_backbone_saver(self): + """Creates a tf.train.Saver as self.backbone_saver for loading backbone. + + A tf.train.Saver must be created and saved in self.backbone_saver before + calling load_backbone_model, with correct variable name mapping to load + variables from checkpoint correctly into the current model. + """ + raise NotImplementedError(self.getName() + ' does not have backbone model.') + + def load_backbone_model(self, sess, backbone_model_path): + """Loads variable values from a pre-trained backbone model. + + This should be used at the beginning of the training process for transfer + learning models using checkpoints of base models. + + Args: + sess: session to train the model. + backbone_model_path: path to backbone model checkpoint file. + """ + del sess, backbone_model_path + raise NotImplementedError(self.getName() + ' does not have backbone model.') + + def add_inference(self, cnn): + """Adds the core layers of the CNN's forward pass. + + This should build the forward pass layers, except for the initial transpose + of the images and the final Dense layer producing the logits. The layers + should be build with the ConvNetBuilder `cnn`, so that when this function + returns, `cnn.top_layer` and `cnn.top_size` refer to the last layer and the + number of units of the layer layer, respectively. + + Args: + cnn: A ConvNetBuilder to build the forward pass layers with. + """ + del cnn + raise NotImplementedError('Must be implemented in derived classes') + + def get_input_data_types(self, subset): + """Return data types of inputs for the specified subset.""" + del subset # Same types for both 'train' and 'validation' subsets. + return [self.data_type, tf.int32] + + def get_input_shapes(self, subset): + """Return data shapes of inputs for the specified subset.""" + del subset # Same shapes for both 'train' and 'validation' subsets. 
+ # Each input is of shape [batch_size, height, width, depth] + # Each label is of shape [batch_size] + return [[self.batch_size, self.image_size, self.image_size, self.depth], + [self.batch_size]] + + def get_synthetic_inputs(self, input_name, nclass): + # Synthetic input should be within [0, 255]. + image_shape, label_shape = self.get_input_shapes('train') + inputs = tf.truncated_normal( + image_shape, + dtype=self.data_type, + mean=127, + stddev=60, + name=self.model_name + '_synthetic_inputs') + inputs = variables_module.VariableV1( + inputs, trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES], + name=input_name) + labels = tf.random_uniform( + label_shape, + minval=0, + maxval=nclass - 1, + dtype=tf.int32, + name=self.model_name + '_synthetic_labels') + return (inputs, labels) + + def gpu_preprocess_nhwc(self, images, phase_train=True): + del phase_train + return images + + def build_network(self, + inputs, + phase_train=True, + nclass=1001): + """Returns logits from input images. + + Args: + inputs: The input images and labels + phase_train: True during training. False during evaluation. + nclass: Number of classes that the images can belong to. + + Returns: + A BuildNetworkResult which contains the logits and model-specific extra + information. + """ + images = inputs[0] + images = self.gpu_preprocess_nhwc(images, phase_train) + if self.data_format == 'NCHW': + images = tf.transpose(images, [0, 3, 1, 2]) + var_type = tf.float32 + if self.data_type == tf.float16 and self.fp16_vars: + var_type = tf.float16 + network = convnet_builder.ConvNetBuilder( + images, self.depth, phase_train, self.use_tf_layers, self.data_format, + self.data_type, var_type) + with tf.variable_scope('cg', custom_getter=network.get_custom_getter()): + self.add_inference(network) + # Add the final fully-connected class layer + logits = ( + network.affine(nclass, activation='linear') + if not self.skip_final_affine_layer() else network.top_layer) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_FINAL_SHAPE, + value=logits.shape.as_list()[1:]) + aux_logits = None + if network.aux_top_layer is not None: + with network.switch_to_aux_top_layer(): + aux_logits = network.affine(nclass, activation='linear', stddev=0.001) + if self.data_type == tf.float16: + # TODO(reedwm): Determine if we should do this cast here. + logits = tf.cast(logits, tf.float32) + if aux_logits is not None: + aux_logits = tf.cast(aux_logits, tf.float32) + return BuildNetworkResult( + logits=logits, extra_info=None if aux_logits is None else aux_logits) + + def loss_function(self, inputs, build_network_result): + """Returns the op to measure the loss of the model.""" + logits = build_network_result.logits + _, labels = inputs + # TODO(laigd): consider putting the aux logit in the Inception model, + # which could call super.loss_function twice, once with the normal logits + # and once with the aux logits. 
+ aux_logits = build_network_result.extra_info + with tf.name_scope('xentropy'): + mlperf.logger.log(key=mlperf.tags.MODEL_HP_LOSS_FN, value=mlperf.tags.CCE) + cross_entropy = tf.losses.sparse_softmax_cross_entropy( + logits=logits, labels=labels) + loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') + if aux_logits is not None: + with tf.name_scope('aux_xentropy'): + aux_cross_entropy = tf.losses.sparse_softmax_cross_entropy( + logits=aux_logits, labels=labels) + aux_loss = 0.4 * tf.reduce_mean(aux_cross_entropy, name='aux_loss') + loss = tf.add_n([loss, aux_loss]) + return loss + + def accuracy_function(self, inputs, logits): + """Returns the ops to measure the accuracy of the model.""" + _, labels = inputs + top_1_op = tf.reduce_sum( + tf.cast(tf.nn.in_top_k(logits, labels, 1), self.data_type)) + top_5_op = tf.reduce_sum( + tf.cast(tf.nn.in_top_k(logits, labels, 5), self.data_type)) + return {'top_1_accuracy': top_1_op, 'top_5_accuracy': top_5_op} diff --git a/cv/classification/resnet50/tensorflow/models/model_config.py b/cv/classification/resnet50/tensorflow/models/model_config.py new file mode 100644 index 0000000000000000000000000000000000000000..1a31dc6233a71f7609668362a24360b74a6e2262 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/model_config.py @@ -0,0 +1,181 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Model configurations for CNN benchmarks. 
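The accuracy ops above count how many labels fall inside the top-1 and top-5 predictions via `tf.nn.in_top_k`. An equivalent numpy sketch, useful for eyeballing what those counts mean (numpy here is only for illustration, and ties are not handled exactly as `in_top_k` does):

```python
import numpy as np

def in_top_k_count(logits, labels, k):
    """Number of examples whose true label is among the k largest logits."""
    topk = np.argsort(logits, axis=1)[:, -k:]  # indices of the k largest logits per row
    return int(sum(label in row for label, row in zip(labels, topk)))

logits = np.array([[0.1, 2.0, 0.3],    # predicts class 1
                   [1.5, 0.2, 1.4]])   # predicts class 0
labels = np.array([1, 2])
print(in_top_k_count(logits, labels, 1))  # 1: only the first example is correct
print(in_top_k_count(logits, labels, 2))  # 2: class 2 is the runner-up for example 2
```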
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from functools import partial + +from models import alexnet_model +from models import densenet_model +from models import googlenet_model +from models import inception_model +from models import lenet_model +from models import official_resnet_model +from models import overfeat_model +from models import resnet_model +from models import trivial_model +from models import vgg_model +from models.experimental import deepspeech +from models.experimental import official_ncf_model + + +_model_name_to_imagenet_model = { + 'vgg11': vgg_model.Vgg11Model, + 'vgg16': vgg_model.Vgg16Model, + 'vgg19': vgg_model.Vgg19Model, + 'lenet': lenet_model.Lenet5Model, + 'googlenet': googlenet_model.GooglenetModel, + 'overfeat': overfeat_model.OverfeatModel, + 'alexnet': alexnet_model.AlexnetModel, + 'trivial': trivial_model.TrivialModel, + 'inception3': inception_model.Inceptionv3Model, + 'inception4': inception_model.Inceptionv4Model, + 'official_resnet18_v2': + partial(official_resnet_model.ImagenetResnetModel, 18), + 'official_resnet34_v2': + partial(official_resnet_model.ImagenetResnetModel, 34), + 'official_resnet50_v2': + partial(official_resnet_model.ImagenetResnetModel, 50), + 'official_resnet101_v2': + partial(official_resnet_model.ImagenetResnetModel, 101), + 'official_resnet152_v2': + partial(official_resnet_model.ImagenetResnetModel, 152), + 'official_resnet200_v2': + partial(official_resnet_model.ImagenetResnetModel, 200), + 'official_resnet18': + partial(official_resnet_model.ImagenetResnetModel, 18, version=1), + 'official_resnet34': + partial(official_resnet_model.ImagenetResnetModel, 34, version=1), + 'official_resnet50': + partial(official_resnet_model.ImagenetResnetModel, 50, version=1), + 'official_resnet101': + partial(official_resnet_model.ImagenetResnetModel, 101, version=1), + 'official_resnet152': + partial(official_resnet_model.ImagenetResnetModel, 152, version=1), + 'official_resnet200': + partial(official_resnet_model.ImagenetResnetModel, 200, version=1), + 'resnet50': resnet_model.create_resnet50_model, + 'resnet50_v1.5': resnet_model.create_resnet50_v1_5_model, + 'resnet50_v2': resnet_model.create_resnet50_v2_model, + 'resnet101': resnet_model.create_resnet101_model, + 'resnet101_v2': resnet_model.create_resnet101_v2_model, + 'resnet152': resnet_model.create_resnet152_model, + 'resnet152_v2': resnet_model.create_resnet152_v2_model, + 'ncf': official_ncf_model.NcfModel, +} + + +_model_name_to_cifar_model = { + 'alexnet': alexnet_model.AlexnetCifar10Model, + 'resnet20': resnet_model.create_resnet20_cifar_model, + 'resnet20_v2': resnet_model.create_resnet20_v2_cifar_model, + 'resnet32': resnet_model.create_resnet32_cifar_model, + 'resnet32_v2': resnet_model.create_resnet32_v2_cifar_model, + 'resnet44': resnet_model.create_resnet44_cifar_model, + 'resnet44_v2': resnet_model.create_resnet44_v2_cifar_model, + 'resnet56': resnet_model.create_resnet56_cifar_model, + 'resnet56_v2': resnet_model.create_resnet56_v2_cifar_model, + 'resnet110': resnet_model.create_resnet110_cifar_model, + 'resnet110_v2': resnet_model.create_resnet110_v2_cifar_model, + 'trivial': trivial_model.TrivialCifar10Model, + 'densenet40_k12': densenet_model.create_densenet40_k12_model, + 'densenet100_k12': densenet_model.create_densenet100_k12_model, + 'densenet100_k24': densenet_model.create_densenet100_k24_model, +} + + +_model_name_to_object_detection_model = { + 'trivial': 
trivial_model.TrivialSSD300Model, +} + + +def _get_model_map(dataset_name): + """Get name to model map for specified dataset.""" + if dataset_name == 'cifar10': + return _model_name_to_cifar_model + elif dataset_name in ('imagenet', 'synthetic', 'imagenette'): + return _model_name_to_imagenet_model + elif dataset_name == 'librispeech': + return {'deepspeech2': deepspeech.DeepSpeech2Model} + elif dataset_name == 'coco': + return _model_name_to_object_detection_model + else: + raise ValueError('Invalid dataset name: %s' % dataset_name) + + +# A model map dict can have this string as a value when TF2 is used, to indicate +# the model is only available in TF1. +_TF1_ONLY_STRING = 'TF1_ONLY' + + +def get_model_config(model_name, dataset, params): + """Map model name to model network configuration.""" + model_map = _get_model_map(dataset.name) + if model_name not in model_map: + raise ValueError('Invalid model name \'%s\' for dataset \'%s\'' % + (model_name, dataset.name)) + model = model_map[model_name](params=params) + if model == 'TF1_ONLY': + raise ValueError('Model \'%s\' can only be used with TensorFlow 1' + % (model_name,)) + return model + + +def register_model(model_name, dataset_name, model_func): + """Register a new model that can be obtained with `get_model_config`.""" + model_map = _get_model_map(dataset_name) + if model_name in model_map: + raise ValueError('Model "%s" is already registered for dataset "%s"' % + (model_name, dataset_name)) + model_map[model_name] = model_func + + +# pylint: disable=g-import-not-at-top +try: + from tensorflow.contrib import slim # pylint: disable=unused-import + can_import_contrib = True +except ImportError: + can_import_contrib = False + + +def register_tf1_models(): + """Registers all the TensorFlow 1-only models. + + TF 1-only models use contrib, which was removed in TF 2. If contrib can be + imported, the TF 1-only models are registered normally. If contrib cannot be + imported, the models are registered with the 'TF1_ONLY' string instead, which + will cause an error to be thrown if these models are used. + """ + if can_import_contrib: + from models.tf1_only import mobilenet_v2 + from models.tf1_only import nasnet_model + from models.tf1_only import ssd_model + register_model('mobilenet', 'imagenet', mobilenet_v2.MobilenetModel) + register_model('nasnet', 'imagenet', nasnet_model.NasnetModel) + register_model('nasnetlarge', 'imagenet', nasnet_model.NasnetLargeModel) + register_model('nasnet', 'cifar10', nasnet_model.NasnetCifarModel) + register_model('ssd300', 'coco', ssd_model.SSD300Model) + else: + register_model('mobilenet', 'imagenet', 'TF1_ONLY') + register_model('nasnet', 'imagenet', 'TF1_ONLY') + register_model('nasnetlarge', 'imagenet', 'TF1_ONLY') + register_model('nasnet', 'cifar10', 'TF1_ONLY') + register_model('ssd300', 'coco', 'TF1_ONLY') + diff --git a/cv/classification/resnet50/tensorflow/models/official_resnet_model.py b/cv/classification/resnet50/tensorflow/models/official_resnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a70943c644550fe1a092b20e2c9a9f63cd797623 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/official_resnet_model.py @@ -0,0 +1,77 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Import official resnet models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +import datasets +from models import model as model_lib + + +class ImagenetResnetModel(model_lib.CNNModel): + """Official resnet models.""" + + def __init__(self, resnet_size, version=2, params=None): + """These are the parameters that work for Imagenet data. + + Args: + resnet_size: The number of convolutional layers needed in the model. + version: 1 or 2 for v1 or v2, respectively. + params: params passed by BenchmarkCNN. + """ + default_batch_sizes = { + 50: 128, + 101: 32, + 152: 32 + } + batch_size = default_batch_sizes.get(resnet_size, 32) + default_learning_rate = 0.0125 * batch_size / 32 + model_name = 'official_resnet_{}_v{}'.format(resnet_size, version) + super(ImagenetResnetModel, self).__init__( + model_name, 224, batch_size, default_learning_rate, params=params) + self.resnet_size = resnet_size + self.version = version + + def get_learning_rate(self, global_step, batch_size): + num_batches_per_epoch = ( + float(datasets.IMAGENET_NUM_TRAIN_IMAGES) / batch_size) + boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]] + values = [1, 0.1, 0.01, 0.001, 0.0001] + adjusted_learning_rate = ( + self.learning_rate / self.default_batch_size * batch_size) + values = [v * adjusted_learning_rate for v in values] + return tf.train.piecewise_constant(global_step, boundaries, values) + + def build_network(self, images, phase_train=True, nclass=1001, + data_type=tf.float32): + # pylint: disable=g-import-not-at-top + try: + from official.resnet.r1.imagenet_main import ImagenetModel + except ImportError: + tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.') + raise + images = tf.cast(images, data_type) + model_class = ImagenetModel(resnet_size=self.resnet_size, + resnet_version=self.version, + # The official model dtype seems to be ignored, + # as the dtype it uses is the dtype of the input + # images. Doesn't hurt to set it though. + dtype=data_type) + logits = model_class(images, phase_train) + logits = tf.cast(logits, tf.float32) + return model_lib.BuildNetworkResult(logits=logits, extra_info=None) diff --git a/cv/classification/resnet50/tensorflow/models/overfeat_model.py b/cv/classification/resnet50/tensorflow/models/overfeat_model.py new file mode 100644 index 0000000000000000000000000000000000000000..7483bcbf3221f719e31baad4b9c93a4f52b0f629 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/overfeat_model.py @@ -0,0 +1,53 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Overfeat model configuration. + +References: + OverFeat: Integrated Recognition, Localization and Detection using + Convolutional Networks + Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus, + Yann LeCun, 2014 + http://arxiv.org/abs/1312.6229 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from models import model + + +class OverfeatModel(model.CNNModel): + """OverfeatModel.""" + + def __init__(self, params=None): + super(OverfeatModel, self).__init__( + 'overfeat', 231, 32, 0.005, params=params) + + def add_inference(self, cnn): + # Note: VALID requires padding the images by 3 in width and height + cnn.conv(96, 11, 11, 4, 4, mode='VALID') + cnn.mpool(2, 2) + cnn.conv(256, 5, 5, 1, 1, mode='VALID') + cnn.mpool(2, 2) + cnn.conv(512, 3, 3) + cnn.conv(1024, 3, 3) + cnn.conv(1024, 3, 3) + cnn.mpool(2, 2) + cnn.reshape([-1, 1024 * 6 * 6]) + cnn.affine(3072) + cnn.dropout() + cnn.affine(4096) + cnn.dropout() diff --git a/cv/classification/resnet50/tensorflow/models/resnet_model.py b/cv/classification/resnet50/tensorflow/models/resnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6340a30b89b661ea884df849e6c0949a2c7b9c86 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/resnet_model.py @@ -0,0 +1,489 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Resnet model configuration. + +References: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition + arXiv:1512.03385 (2015) + + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks + arXiv:1603.05027 (2016) + + Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, + Alan L. Yuille + DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, + Atrous Convolution, and Fully Connected CRFs + arXiv:1606.00915 (2016) +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf +import datasets +import mlperf +from models import model as model_lib + + +def bottleneck_block_v1(cnn, depth, depth_bottleneck, stride): + """Bottleneck block with identity short-cut for ResNet v1. 
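+
+  The block applies 1x1 (reduce), 3x3, and 1x1 (expand) convolutions, each
+  with batch norm, and adds the result to the shortcut before a final relu;
+  any stride is applied in the first 1x1 convolution.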
+ + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + """ + input_layer = cnn.top_layer + in_size = cnn.top_size + name_key = 'resnet_v1' + name = name_key + str(cnn.counts[name_key]) + cnn.counts[name_key] += 1 + + with tf.variable_scope(name): + if depth == in_size: + if stride == 1: + shortcut = input_layer + else: + shortcut = cnn.apool( + 1, 1, stride, stride, input_layer=input_layer, + num_channels_in=in_size) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + else: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, + use_batch_norm=True, input_layer=input_layer, + num_channels_in=in_size, bias=None) + cnn.conv(depth_bottleneck, 1, 1, stride, stride, + input_layer=input_layer, num_channels_in=in_size, + use_batch_norm=True, bias=None) + cnn.conv(depth_bottleneck, 3, 3, 1, 1, mode='SAME_RESNET', + use_batch_norm=True, bias=None) + res = cnn.conv(depth, 1, 1, 1, 1, activation=None, + use_batch_norm=True, bias=None) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_SHORTCUT_ADD) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + output = tf.nn.relu(shortcut + res) + cnn.top_layer = output + cnn.top_size = depth + + +def bottleneck_block_v1_5(cnn, depth, depth_bottleneck, stride): + """Bottleneck block with identity short-cut for ResNet v1.5. + + ResNet v1.5 is the informal name for ResNet v1 where stride 2 is used in the + first 3x3 convolution of each block instead of the first 1x1 convolution. + + First seen at https://github.com/facebook/fb.resnet.torch. Used in the paper + "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour" + (arXiv:1706.02677v2) and by fast.ai to train to accuracy in 45 epochs using + multiple image sizes. + + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + """ + input_layer = cnn.top_layer + in_size = cnn.top_size + name_key = 'resnet_v1.5' + name = name_key + str(cnn.counts[name_key]) + cnn.counts[name_key] += 1 + + with tf.variable_scope(name): + if depth == in_size: + if stride == 1: + shortcut = input_layer + else: + shortcut = cnn.apool( + 1, 1, stride, stride, input_layer=input_layer, + num_channels_in=in_size) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + else: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, + use_batch_norm=True, input_layer=input_layer, + num_channels_in=in_size, bias=None) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + cnn.conv(depth_bottleneck, 1, 1, 1, 1, + input_layer=input_layer, num_channels_in=in_size, + use_batch_norm=True, bias=None) + cnn.conv(depth_bottleneck, 3, 3, stride, stride, mode='SAME_RESNET', + use_batch_norm=True, bias=None) + res = cnn.conv(depth, 1, 1, 1, 1, activation=None, + use_batch_norm=True, bias=None) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_SHORTCUT_ADD) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + output = tf.nn.relu(shortcut + res) + cnn.top_layer = output + cnn.top_size = depth + + +def bottleneck_block_v2(cnn, depth, depth_bottleneck, stride): + """Bottleneck block with identity short-cut for ResNet v2. 
+ + The main difference from v1 is that a batch norm and relu are done at the + start of the block, instead of the end. This initial batch norm and relu is + collectively called a pre-activation. + + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + """ + input_layer = cnn.top_layer + in_size = cnn.top_size + name_key = 'resnet_v2' + name = name_key + str(cnn.counts[name_key]) + cnn.counts[name_key] += 1 + + preact = cnn.batch_norm() + mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU) + preact = tf.nn.relu(preact) + with tf.variable_scope(name): + if depth == in_size: + if stride == 1: + shortcut = input_layer + else: + shortcut = cnn.apool( + 1, 1, stride, stride, input_layer=input_layer, + num_channels_in=in_size) + mlperf.logger.log_projection(input_tensor=input_layer, + output_tensor=shortcut) + else: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, use_batch_norm=False, + input_layer=preact, num_channels_in=in_size, bias=None) + cnn.conv(depth_bottleneck, 1, 1, stride, stride, + input_layer=preact, num_channels_in=in_size, + use_batch_norm=True, bias=None) + cnn.conv(depth_bottleneck, 3, 3, 1, 1, mode='SAME_RESNET', + use_batch_norm=True, bias=None) + res = cnn.conv(depth, 1, 1, 1, 1, activation=None, + use_batch_norm=False, bias=None) + mlperf.logger.log(key=mlperf.tags.MODEL_HP_SHORTCUT_ADD) + output = shortcut + res + cnn.top_layer = output + cnn.top_size = depth + + +def bottleneck_block(cnn, depth, depth_bottleneck, stride, version): + """Bottleneck block with identity short-cut. + + Args: + cnn: the network to append bottleneck blocks. + depth: the number of output filters for this bottleneck block. + depth_bottleneck: the number of bottleneck filters for this block. + stride: Stride used in the first layer of the bottleneck block. + version: version of ResNet to build. + """ + mlperf.logger.log(key=mlperf.tags.MODEL_HP_BLOCK_TYPE, + value=mlperf.tags.BOTTLENECK_BLOCK) + mlperf.logger.log_begin_block( + input_tensor=cnn.top_layer, block_type=mlperf.tags.BOTTLENECK_BLOCK) + if version == 'v2': + bottleneck_block_v2(cnn, depth, depth_bottleneck, stride) + elif version == 'v1.5': + bottleneck_block_v1_5(cnn, depth, depth_bottleneck, stride) + else: + bottleneck_block_v1(cnn, depth, depth_bottleneck, stride) + mlperf.logger.log_end_block(output_tensor=cnn.top_layer) + + +def residual_block(cnn, depth, stride, version, projection_shortcut=False): + """Residual block with identity short-cut. + + Args: + cnn: the network to append residual blocks. + depth: the number of output filters for this residual block. + stride: Stride used in the first layer of the residual block. + version: version of ResNet to build. + projection_shortcut: indicator of using projection shortcut, even if top + size and depth are equal + """ + pre_activation = True if version == 'v2' else False + input_layer = cnn.top_layer + in_size = cnn.top_size + + if projection_shortcut: + shortcut = cnn.conv( + depth, 1, 1, stride, stride, activation=None, + use_batch_norm=True, input_layer=input_layer, + num_channels_in=in_size, bias=None) + elif in_size != depth: + # Plan A of shortcut. 
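+    # Option A from the ResNet paper: downsample with average pooling and
+    # zero-pad the extra output channels, so the shortcut adds no parameters.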
+ shortcut = cnn.apool(1, 1, stride, stride, + input_layer=input_layer, + num_channels_in=in_size) + padding = (depth - in_size) // 2 + if cnn.channel_pos == 'channels_last': + shortcut = tf.pad( + shortcut, [[0, 0], [0, 0], [0, 0], [padding, padding]]) + else: + shortcut = tf.pad( + shortcut, [[0, 0], [padding, padding], [0, 0], [0, 0]]) + else: + shortcut = input_layer + if pre_activation: + res = cnn.batch_norm(input_layer) + res = tf.nn.relu(res) + else: + res = input_layer + cnn.conv(depth, 3, 3, stride, stride, + input_layer=res, num_channels_in=in_size, + use_batch_norm=True, bias=None) + if pre_activation: + res = cnn.conv(depth, 3, 3, 1, 1, activation=None, + use_batch_norm=False, bias=None) + output = shortcut + res + else: + res = cnn.conv(depth, 3, 3, 1, 1, activation=None, + use_batch_norm=True, bias=None) + output = tf.nn.relu(shortcut + res) + cnn.top_layer = output + cnn.top_size = depth + + +class ResnetModel(model_lib.CNNModel): + """Resnet cnn network configuration.""" + + def __init__(self, model, layer_counts, params=None): + default_batch_sizes = { + 'resnet50': 64, + 'resnet101': 32, + 'resnet152': 32, + 'resnet50_v1.5': 64, + 'resnet101_v1.5': 32, + 'resnet152_v1.5': 32, + 'resnet50_v2': 64, + 'resnet101_v2': 32, + 'resnet152_v2': 32, + } + batch_size = default_batch_sizes.get(model, 32) + # The ResNet paper uses a starting lr of .1 at bs=256. + self.base_lr_batch_size = 256 + base_lr = 0.128 + if params: + if params.resnet_base_lr: + base_lr = params.resnet_base_lr + if params.use_deep_stem: + self.use_deep_stem = True + else: + self.use_deep_stem = False + super(ResnetModel, self).__init__(model, 224, batch_size, base_lr, + layer_counts, params=params) + if 'v2' in model: + self.version = 'v2' + elif 'v1.5' in model: + self.version = 'v1.5' + else: + self.version = 'v1' + + def add_inference(self, cnn): + if self.layer_counts is None: + raise ValueError('Layer counts not specified for %s' % self.get_model()) + # Drop batch size from shape logging. 
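+    # cnn.top_layer.shape is [batch, <spatial/channel dims>]; [1:] keeps only
+    # the per-example dimensions.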
+ mlperf.logger.log(key=mlperf.tags.MODEL_HP_INITIAL_SHAPE, + value=cnn.top_layer.shape.as_list()[1:]) + cnn.use_batch_norm = True + cnn.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} + if self.use_deep_stem: + cnn.conv(32, 3, 3, 2, 2, mode='SAME_RESNET', use_batch_norm=True) + cnn.conv(32, 3, 3, 1, 1, mode='SAME_RESNET', use_batch_norm=True) + cnn.conv(64, 3, 3, 1, 1, mode='SAME_RESNET', use_batch_norm=True) + else: + cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True) + cnn.mpool(3, 3, 2, 2, mode='SAME') + for _ in xrange(self.layer_counts[0]): + bottleneck_block(cnn, 256, 64, 1, self.version) + for i in xrange(self.layer_counts[1]): + stride = 2 if i == 0 else 1 + bottleneck_block(cnn, 512, 128, stride, self.version) + for i in xrange(self.layer_counts[2]): + stride = 2 if i == 0 else 1 + bottleneck_block(cnn, 1024, 256, stride, self.version) + for i in xrange(self.layer_counts[3]): + stride = 2 if i == 0 else 1 + bottleneck_block(cnn, 2048, 512, stride, self.version) + if self.version == 'v2': + cnn.batch_norm() + cnn.top_layer = tf.nn.relu(cnn.top_layer) + cnn.spatial_mean() + + def get_learning_rate(self, global_step, batch_size): + rescaled_lr = self.get_scaled_base_learning_rate(batch_size) + num_batches_per_epoch = ( + datasets.IMAGENET_NUM_TRAIN_IMAGES / batch_size) + boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]] + values = [1, 0.1, 0.01, 0.001, 0.0001] + values = [rescaled_lr * v for v in values] + lr = tf.train.piecewise_constant(global_step, boundaries, values) + warmup_steps = int(num_batches_per_epoch * 5) + mlperf.logger.log(key=mlperf.tags.OPT_LR_WARMUP_STEPS, value=warmup_steps) + warmup_lr = ( + rescaled_lr * tf.cast(global_step, tf.float32) / tf.cast( + warmup_steps, tf.float32)) + return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr) + + def get_scaled_base_learning_rate(self, batch_size): + """Calculates base learning rate for creating lr schedule. + + In replicated mode, gradients are summed rather than averaged which, with + the sgd and momentum optimizers, increases the effective learning rate by + lr * num_gpus. Dividing the base lr by num_gpus negates the increase. + + Args: + batch_size: Total batch-size. + + Returns: + Base learning rate to use to create lr schedule. + """ + base_lr = self.learning_rate + if self.params.variable_update == 'replicated': + base_lr = self.learning_rate / self.params.num_gpus + scaled_lr = base_lr * (batch_size / self.base_lr_batch_size) + return scaled_lr + + +def create_resnet50_model(params): + return ResnetModel('resnet50', (3, 4, 6, 3), params=params) + + +def create_resnet50_v1_5_model(params): + return ResnetModel('resnet50_v1.5', (3, 4, 6, 3), params=params) + + +def create_resnet50_v2_model(params): + return ResnetModel('resnet50_v2', (3, 4, 6, 3), params=params) + + +def create_resnet101_model(params): + return ResnetModel('resnet101', (3, 4, 23, 3), params=params) + + +def create_resnet101_v2_model(params): + return ResnetModel('resnet101_v2', (3, 4, 23, 3), params=params) + + +def create_resnet152_model(params): + return ResnetModel('resnet152', (3, 8, 36, 3), params=params) + + +def create_resnet152_v2_model(params): + return ResnetModel('resnet152_v2', (3, 8, 36, 3), params=params) + + +class ResnetCifar10Model(model_lib.CNNModel): + """Resnet cnn network configuration for Cifar 10 dataset. + + V1 model architecture follows the one defined in the paper: + https://arxiv.org/pdf/1512.03385.pdf. 
+ + V2 model architecture follows the one defined in the paper: + https://arxiv.org/pdf/1603.05027.pdf. + """ + + def __init__(self, model, layer_counts, params=None): + if 'v2' in model: + self.version = 'v2' + else: + self.version = 'v1' + super(ResnetCifar10Model, self).__init__( + model, 32, 128, 0.1, layer_counts, params=params) + + def add_inference(self, cnn): + if self.layer_counts is None: + raise ValueError('Layer counts not specified for %s' % self.get_model()) + + cnn.use_batch_norm = True + cnn.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} + if self.version == 'v2': + cnn.conv(16, 3, 3, 1, 1, use_batch_norm=True) + else: + cnn.conv(16, 3, 3, 1, 1, activation=None, use_batch_norm=True) + for i in xrange(self.layer_counts[0]): + # reshape to batch_size x 16 x 32 x 32 + residual_block(cnn, 16, 1, self.version) + for i in xrange(self.layer_counts[1]): + # Subsampling is performed at the first convolution with a stride of 2 + stride = 2 if i == 0 else 1 + # reshape to batch_size x 32 x 16 x 16 + residual_block(cnn, 32, stride, self.version) + for i in xrange(self.layer_counts[2]): + stride = 2 if i == 0 else 1 + # reshape to batch_size x 64 x 8 x 8 + residual_block(cnn, 64, stride, self.version) + if self.version == 'v2': + cnn.batch_norm() + cnn.top_layer = tf.nn.relu(cnn.top_layer) + cnn.spatial_mean() + + def get_learning_rate(self, global_step, batch_size): + num_batches_per_epoch = int(50000 / batch_size) + boundaries = num_batches_per_epoch * np.array([82, 123, 300], + dtype=np.int64) + boundaries = [x for x in boundaries] + values = [0.1, 0.01, 0.001, 0.0002] + return tf.train.piecewise_constant(global_step, boundaries, values) + + +def create_resnet20_cifar_model(params): + return ResnetCifar10Model('resnet20', (3, 3, 3), params=params) + + +def create_resnet20_v2_cifar_model(params): + return ResnetCifar10Model('resnet20_v2', (3, 3, 3), params=params) + + +def create_resnet32_cifar_model(params): + return ResnetCifar10Model('resnet32', (5, 5, 5), params=params) + + +def create_resnet32_v2_cifar_model(params): + return ResnetCifar10Model('resnet32_v2', (5, 5, 5), params=params) + + +def create_resnet44_cifar_model(params): + return ResnetCifar10Model('resnet44', (7, 7, 7), params=params) + + +def create_resnet44_v2_cifar_model(params): + return ResnetCifar10Model('resnet44_v2', (7, 7, 7), params=params) + + +def create_resnet56_cifar_model(params): + return ResnetCifar10Model('resnet56', (9, 9, 9), params=params) + + +def create_resnet56_v2_cifar_model(params): + return ResnetCifar10Model('resnet56_v2', (9, 9, 9), params=params) + + +def create_resnet110_cifar_model(params): + return ResnetCifar10Model('resnet110', (18, 18, 18), params=params) + + +def create_resnet110_v2_cifar_model(params): + return ResnetCifar10Model('resnet110_v2', (18, 18, 18), params=params) diff --git a/cv/classification/resnet50/tensorflow/models/resnet_model_test.py b/cv/classification/resnet50/tensorflow/models/resnet_model_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b4052fcd2e996c7f02458b6754dfa6dd52635a94 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/resnet_model_test.py @@ -0,0 +1,80 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for resnet_model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import mock +import tensorflow.compat.v1 as tf + +from models import resnet_model + + +class ResNetModelTest(tf.test.TestCase): + + def testGetScaledBaseLearningRateOneGpuLrFromParams(self): + """Verifies setting params.resnet_base_lr pipes through.""" + lr = self._get_scaled_base_learning_rate(1, + 'parameter_server', + 256, + base_lr=.050) + self.assertEqual(lr, .050) + + def testGetScaledBaseLearningRateOneGpu(self): + lr = self._get_scaled_base_learning_rate(1, 'parameter_server', 128) + self.assertEqual(lr, .064) + + def testGetScaledBaseLearningRateEightGpuReplicated(self): + lr = self._get_scaled_base_learning_rate(8, 'replicated', 256 * 8) + self.assertEqual(lr, .128) + + def testGetScaledBaseLearningRateTwoGpuParameter(self): + lr = self._get_scaled_base_learning_rate(2, 'parameter_server', 256 * 2) + self.assertEqual(lr, .256) + + def testGetScaledBaseLearningRateTwoGpuUneven(self): + lr = self._get_scaled_base_learning_rate(2, 'replicated', 13) + self.assertEqual(lr, 0.0032500000000000003) + + def _get_scaled_base_learning_rate(self, + num_gpus, + variable_update, + batch_size, + base_lr=None): + """Simplifies testing different learning rate calculations. + + Args: + num_gpus: Number of GPUs to be used. + variable_update: Type of variable update used. + batch_size: Total batch size. + base_lr: Base learning rate before scaling. + + Returns: + Base learning rate that would be used to create lr schedule. + """ + params = mock.Mock() + params.num_gpus = num_gpus + params.variable_update = variable_update + if base_lr: + params.resnet_base_lr = base_lr + resnet50_model = resnet_model.ResnetModel('resnet50', 50, params=params) + return resnet50_model.get_scaled_base_learning_rate(batch_size) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/__init__.py b/cv/classification/resnet50/tensorflow/models/tf1_only/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..e1c2275a51635ae670e753fa8f9952f178fbef94 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet.py @@ -0,0 +1,467 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Mobilenet Base Class, branched from slim for fp16 performance study.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import collections +import contextlib +import copy +import os + +import tensorflow.compat.v1 as tf +from tensorflow.contrib import slim as contrib_slim + +slim = contrib_slim + + +@slim.add_arg_scope +def apply_activation(x, name=None, activation_fn=None): + return activation_fn(x, name=name) if activation_fn else x + + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@contextlib.contextmanager +def _set_arg_scope_defaults(defaults): + """Sets arg scope defaults for all items present in defaults. + + Args: + defaults: dictionary/list of pairs, containing a mapping from + function to a dictionary of default args. + + Yields: + context manager where all defaults are set. 
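+
+  Note: the defaults are applied recursively, each (func, kwargs) pair opening
+  its own slim.arg_scope nested inside the previous one.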
+ """ + if hasattr(defaults, 'items'): + items = list(defaults.items()) + else: + items = defaults + if not items: + yield + else: + func, default_arg = items[0] + with slim.arg_scope(func, **default_arg): + with _set_arg_scope_defaults(items[1:]): + yield + + +@slim.add_arg_scope +def depth_multiplier(output_params, + multiplier, + divisible_by=8, + min_depth=8, + **unused_kwargs): + if 'num_outputs' not in output_params: + return + d = output_params['num_outputs'] + output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by, + min_depth) + + +_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func']) + + +def op(opfunc, **params): + multiplier = params.pop('multiplier_transorm', depth_multiplier) + return _Op(opfunc, params=params, multiplier_func=multiplier) + + +class NoOpScope(object): + """No-op context manager.""" + + def __enter__(self): + return + + def __exit__(self, exc_type, exc_value, traceback): + return False + + +def safe_arg_scope(funcs, **kwargs): + """Returns `slim.arg_scope` with all None arguments removed. + + Arguments: + funcs: Functions to pass to `arg_scope`. + **kwargs: Arguments to pass to `arg_scope`. + + Returns: + arg_scope or No-op context manager. + + Note: can be useful if None value should be interpreted as "do not overwrite + this parameter value". + """ + filtered_args = {name: value for name, value in kwargs.items() + if value is not None} + if filtered_args: + return slim.arg_scope(funcs, **filtered_args) + else: + return NoOpScope() + + +@slim.add_arg_scope +def mobilenet_base( # pylint: disable=invalid-name + inputs, + conv_defs, + multiplier=1.0, + final_endpoint=None, + output_stride=None, + use_explicit_padding=False, + scope=None, + is_training=False): + """Mobilenet base network. + + Constructs a network from inputs to the given final endpoint. By default + the network is constructed in inference mode. To create network + in training mode use: + + with slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_base(...) + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + conv_defs: A list of op(...) layers specifying the net architecture. + multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + final_endpoint: The name of last layer, for early termination for + for V1-based networks: last layer is "layer_14", for V2: "layer_20" + output_stride: An integer that specifies the requested ratio of input to + output spatial resolution. If not None, then we invoke atrous convolution + if necessary to prevent the network from reducing the spatial resolution + of the activation maps. Allowed values are 1 or any even number, excluding + zero. Typical values are 8 (accurate fully convolutional mode), 16 + (fast fully convolutional mode), and 32 (classification mode). + + NOTE- output_stride relies on all consequent operators to support dilated + operators via "rate" parameter. This might require wrapping non-conv + operators to operate properly. + + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + scope: optional variable scope. + is_training: How to setup batch_norm and other ops. Note: most of the time + this does not need be set directly. 
Use mobilenet.training_scope() to set + up training instead. This parameter is here for backward compatibility + only. It is safe to set it to the value matching + training_scope(is_training=...). It is also safe to explicitly set + it to False, even if there is outer training_scope set to to training. + (The network will be built in inference mode). If this is set to None, + no arg_scope is added for slim.batch_norm's is_training parameter. + + Returns: + tensor_out: output tensor. + end_points: a set of activations for external use, for example summaries or + losses. + + Raises: + ValueError: depth_multiplier <= 0, or the target output_stride is not + allowed. + """ + if multiplier <= 0: + raise ValueError('multiplier is not greater than zero.') + + # Set conv defs defaults and overrides. + conv_defs_defaults = conv_defs.get('defaults', {}) + conv_defs_overrides = conv_defs.get('overrides', {}) + if use_explicit_padding: + conv_defs_overrides = copy.deepcopy(conv_defs_overrides) + conv_defs_overrides[ + (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'} + + if output_stride is not None: + if output_stride == 0 or (output_stride > 1 and output_stride % 2): + raise ValueError('Output stride must be None, 1 or a multiple of 2.') + + # a) Set the tensorflow scope + # b) set padding to default: note we might consider removing this + # since it is also set by mobilenet_scope + # c) set all defaults + # d) set all extra overrides. + with _scope_all(scope, default_scope='Mobilenet'), \ + safe_arg_scope([slim.batch_norm], is_training=is_training), \ + _set_arg_scope_defaults(conv_defs_defaults), \ + _set_arg_scope_defaults(conv_defs_overrides): + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + net = inputs + # Insert default parameters before the base scope which includes + # any custom overrides set in mobilenet. + end_points = {} + scopes = {} + for i, opdef in enumerate(conv_defs['spec']): + params = dict(opdef.params) + opdef.multiplier_func(params, multiplier) + stride = params.get('stride', 1) + if output_stride is not None and current_stride == output_stride: + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate *= stride + else: + layer_stride = stride + layer_rate = 1 + current_stride *= stride + # Update params. + params['stride'] = layer_stride + # Only insert rate to params if rate > 1. 
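+      # (a rate of 1 is an ordinary convolution, so no override is needed)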
+ if layer_rate > 1: + params['rate'] = layer_rate + # Set padding + if use_explicit_padding: + if 'kernel_size' in params: + net = _fixed_padding(net, params['kernel_size'], layer_rate) + else: + params['use_explicit_padding'] = True + + end_point = 'layer_%d' % (i + 1) + try: + net = opdef.op(net, **params) + except Exception: + print('Failed to create op %i: %r params: %r' % (i, opdef, params)) + raise + end_points[end_point] = net + scope = os.path.dirname(net.name) + scopes[scope] = end_point + if final_endpoint is not None and end_point == final_endpoint: + break + + # Add all tensors that end with 'output' to + # endpoints + for t in net.graph.get_operations(): + scope = os.path.dirname(t.name) + bn = os.path.basename(t.name) + if scope in scopes and t.name.endswith('output'): + end_points[scopes[scope] + '/' + bn] = t.outputs[0] + return net, end_points + + +@contextlib.contextmanager +def _scope_all(scope, default_scope=None): + with tf.variable_scope(scope, default_name=default_scope) as s,\ + tf.name_scope(s.original_name_scope): + yield s + + +@slim.add_arg_scope +def mobilenet(inputs, + num_classes=1001, + prediction_fn=slim.softmax, + reuse=None, + scope='Mobilenet', + base_only=False, + **mobilenet_args): + """Mobilenet model for classification, supports both V1 and V2. + + Note: default mode is inference, use mobilenet.training_scope to create + training network. + + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + num_classes: number of predicted classes. If 0 or None, the logits layer + is omitted and the input features to the logits layer (before dropout) + are returned instead. + prediction_fn: a function to get predictions out of logits + (default softmax). + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + base_only: if True will only create the base of the network (no pooling + and no logits). + **mobilenet_args: passed to mobilenet_base verbatim. + - conv_defs: list of conv defs + - multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + - output_stride: will ensure that the last layer has at most total stride. + If the architecture calls for more stride than that provided + (e.g. output_stride=16, but the architecture has 5 stride=2 operators), + it will replace output_stride with fractional convolutions using Atrous + Convolutions. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation tensor. + + Raises: + ValueError: Input rank is invalid. 
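+
+  Example (illustrative; V2_DEF is defined in models/tf1_only/mobilenet_v2.py):
+    images = tf.placeholder(tf.float32, [None, 224, 224, 3])
+    logits, end_points = mobilenet(images, conv_defs=mobilenet_v2.V2_DEF)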
+ """ + is_training = mobilenet_args.get('is_training', False) + input_shape = inputs.get_shape().as_list() + if len(input_shape) != 4: + raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) + + with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope: + inputs = tf.identity(inputs, 'input') + net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args) + if base_only: + return net, end_points + + net = tf.identity(net, name='embedding') + + with tf.variable_scope('Logits'): + net = global_pool(net) + end_points['global_pool'] = net + if not num_classes: + return net, end_points + net = slim.dropout(net, scope='Dropout', is_training=is_training) + # 1 x 1 x num_classes + # Note: legacy scope name. + logits = slim.conv2d( + net, + num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + biases_initializer=tf.zeros_initializer(), + scope='Conv2d_1c_1x1') + + logits = tf.squeeze(logits, [1, 2]) + + logits = tf.identity(logits, name='output') + end_points['Logits'] = logits + if prediction_fn: + end_points['Predictions'] = prediction_fn(logits, 'Predictions') + return logits, end_points + + +def global_pool(input_tensor, pool_op=tf.nn.avg_pool): + """Applies avg pool to produce 1x1 output. + + NOTE: This function is funcitonally equivalenet to reduce_mean, but it has + baked in average pool which has better support across hardware. + + Args: + input_tensor: input tensor + pool_op: pooling op (avg pool is default) + Returns: + a tensor batch_size x 1 x 1 x depth. + """ + shape = input_tensor.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size = tf.convert_to_tensor( + [1, tf.shape(input_tensor)[1], + tf.shape(input_tensor)[2], 1]) + else: + kernel_size = [1, shape[1], shape[2], 1] + output = pool_op( + input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') + # Recover output shape, for unknown shape. + output.set_shape([None, 1, 1, None]) + return output + + +def training_scope(is_training=True, + weight_decay=0.00004, + stddev=0.09, + dropout_keep_prob=0.8, + bn_decay=0.997): + """Defines Mobilenet training scope. + + Usage: + with tf.contrib.slim.arg_scope(mobilenet.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + # the network created will be trainble with dropout/batch norm + # initialized appropriately. + Args: + is_training: if set to False this will ensure that all customizations are + set to non-training mode. This might be helpful for code that is reused + across both training/evaluation, but most of the time training_scope with + value False is not needed. If this is set to None, the parameters is not + added to the batch_norm arg_scope. + + weight_decay: The weight decay to use for regularizing the model. + stddev: Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob: dropout keep probability (not set if equals to None). + bn_decay: decay for the batch norm moving averages (not set if equals to + None). + + Returns: + An argument scope to use via arg_scope. + """ + # Note: do not introduce parameters that would change the inference + # model here (for example whether to use bias), modify conv_def instead. + batch_norm_params = { + 'decay': bn_decay, + 'is_training': is_training + } + if stddev < 0: + weight_intitializer = slim.initializers.xavier_initializer() + else: + weight_intitializer = tf.truncated_normal_initializer(stddev=stddev) + + # Set weight_decay for weights in Conv and FC layers. 
+ with slim.arg_scope( + [slim.conv2d, slim.fully_connected, slim.separable_conv2d], + weights_initializer=weight_intitializer, + normalizer_fn=slim.batch_norm), \ + slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\ + safe_arg_scope([slim.batch_norm], **batch_norm_params), \ + safe_arg_scope([slim.dropout], is_training=is_training, + keep_prob=dropout_keep_prob), \ + slim.arg_scope([slim.conv2d], \ + weights_regularizer=slim.l2_regularizer(weight_decay)), \ + slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: + return s diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_conv_blocks.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_conv_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..34016b277b6cc90700984a44247fb971ce708277 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_conv_blocks.py @@ -0,0 +1,360 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Convolution blocks for mobilenet.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import functools + +import tensorflow.compat.v1 as tf +from tensorflow.contrib import slim + + +def _fixed_padding(inputs, kernel_size, rate=1): + """Pads the input along the spatial dimensions independently of input size. + + Pads the input such that if it was used in a convolution with 'VALID' padding, + the output would have the same dimensions as if the unpadded input was used + in a convolution with 'SAME' padding. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + rate: An integer, rate for atrous convolution. + + Returns: + output: A tensor of size [batch, height_out, width_out, channels] with the + input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). + """ + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
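+  # For example, with divisor=64 a value of 72 would round down to 64 (an
+  # ~11% drop), so it is bumped up to 128 instead.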
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _split_divisible(num, num_ways, divisible_by=8): + """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" + assert num % divisible_by == 0 + assert num // num_ways >= divisible_by + # Note: want to round down, we adjust each split to match the total. + base = num // num_ways // divisible_by * divisible_by + result = [] + accumulated = 0 + for i in range(num_ways): + r = base + while accumulated + r < num * (i + 1) // num_ways: + r += divisible_by + result.append(r) + accumulated += r + assert accumulated == num + return result + + +@contextlib.contextmanager +def _v1_compatible_scope_naming(scope): # pylint: disable=g-missing-docstring + if scope is None: # Create uniqified separable blocks. + with tf.variable_scope(None, default_name='separable') as s, \ + tf.name_scope(s.original_name_scope): + yield '' + else: + # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. + # which provide numbered scopes. + scope += '_' + yield scope + + +@slim.add_arg_scope +def split_separable_conv2d(input_tensor, + num_outputs, + scope=None, + normalizer_fn=None, + stride=1, + rate=1, + endpoints=None, + use_explicit_padding=False): + """Separable mobilenet V1 style convolution. + + Depthwise convolution, with default non-linearity, + followed by 1x1 depthwise convolution. This is similar to + slim.separable_conv2d, but differs in tha it applies batch + normalization and non-linearity to depthwise. This matches + the basic building of Mobilenet Paper + (https://arxiv.org/abs/1704.04861) + + Args: + input_tensor: input + num_outputs: number of outputs + scope: optional name of the scope. Note if provided it will use + scope_depthwise for deptwhise, and scope_pointwise for pointwise. + normalizer_fn: which normalizer function to use for depthwise/pointwise + stride: stride + rate: output rate (also known as dilation rate) + endpoints: optional, if provided, will export additional tensors to it. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + + Returns: + output tesnor + """ + + with _v1_compatible_scope_naming(scope) as scope: + dw_scope = scope + 'depthwise' + endpoints = endpoints if endpoints is not None else {} + kernel_size = [3, 3] + padding = 'SAME' + if use_explicit_padding: + padding = 'VALID' + input_tensor = _fixed_padding(input_tensor, kernel_size, rate) + net = slim.separable_conv2d( + input_tensor, + None, + kernel_size, + depth_multiplier=1, + stride=stride, + rate=rate, + normalizer_fn=normalizer_fn, + padding=padding, + scope=dw_scope) + + endpoints[dw_scope] = net + + pw_scope = scope + 'pointwise' + net = slim.conv2d( + net, + num_outputs, [1, 1], + stride=1, + normalizer_fn=normalizer_fn, + scope=pw_scope) + endpoints[pw_scope] = net + return net + + +def expand_input_by_factor(n, divisible_by=8): + return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) + + +@slim.add_arg_scope +def expanded_conv(input_tensor, + num_outputs, + expansion_size=expand_input_by_factor(6), + stride=1, + rate=1, + kernel_size=(3, 3), + residual=True, + normalizer_fn=None, + split_projection=1, + split_expansion=1, + expansion_transform=None, + depthwise_location='expansion', + depthwise_channel_multiplier=1, + endpoints=None, + use_explicit_padding=False, + padding='SAME', + scope=None): + """Depthwise Convolution Block with expansion. 
+ + Builds a composite convolution that has the following structure + expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) + + Args: + input_tensor: input + num_outputs: number of outputs in the final layer. + expansion_size: the size of expansion, could be a constant or a callable. + If latter it will be provided 'num_inputs' as an input. For forward + compatibility it should accept arbitrary keyword arguments. + Default will expand the input by factor of 6. + stride: depthwise stride + rate: depthwise rate + kernel_size: depthwise kernel + residual: whether to include residual connection between input + and output. + normalizer_fn: batchnorm or otherwise + split_projection: how many ways to split projection operator + (that is conv expansion->bottleneck) + split_expansion: how many ways to split expansion op + (that is conv bottleneck->expansion) ops will keep depth divisible + by this value. + expansion_transform: Optional function that takes expansion + as a single input and returns output. + depthwise_location: where to put depthwise covnvolutions supported + values None, 'input', 'output', 'expansion' + depthwise_channel_multiplier: depthwise channel multiplier: + each input will replicated (with different filters) + that many times. So if input had c channels, + output will have c x depthwise_channel_multpilier. + endpoints: An optional dictionary into which intermediate endpoints are + placed. The keys "expansion_output", "depthwise_output", + "projection_output" and "expansion_transform" are always populated, even + if the corresponding functions are not invoked. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + padding: Padding type to use if `use_explicit_padding` is not set. + scope: optional scope. 
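+
+  With the default expansion_size=expand_input_by_factor(6), an input with,
+  e.g., 24 channels is expanded to 144 channels before the depthwise step.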
+ + Returns: + Tensor of depth num_outputs + + Raises: + TypeError: on inval + """ + with tf.variable_scope(scope, default_name='expanded_conv') as s, \ + tf.name_scope(s.original_name_scope): + prev_depth = input_tensor.get_shape().as_list()[3] + if depthwise_location not in [None, 'input', 'output', 'expansion']: + raise TypeError('%r is unknown value for depthwise_location' % + depthwise_location) + if use_explicit_padding: + if padding != 'SAME': + raise TypeError('`use_explicit_padding` should only be used with ' + '"SAME" padding.') + padding = 'VALID' + depthwise_func = functools.partial( + slim.separable_conv2d, + num_outputs=None, + kernel_size=kernel_size, + depth_multiplier=depthwise_channel_multiplier, + stride=stride, + rate=rate, + normalizer_fn=normalizer_fn, + padding=padding, + scope='depthwise') + # b1 -> b2 * r -> b2 + # i -> (o * r) (bottleneck) -> o + input_tensor = tf.identity(input_tensor, 'input') + net = input_tensor + + if depthwise_location == 'input': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net, activation_fn=None) + + if callable(expansion_size): + inner_size = expansion_size(num_inputs=prev_depth) + else: + inner_size = expansion_size + + if inner_size > net.shape[3]: + net = split_conv( + net, + inner_size, + num_ways=split_expansion, + scope='expand', + stride=1, + normalizer_fn=normalizer_fn) + net = tf.identity(net, 'expansion_output') + if endpoints is not None: + endpoints['expansion_output'] = net + + if depthwise_location == 'expansion': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net) + + net = tf.identity(net, name='depthwise_output') + if endpoints is not None: + endpoints['depthwise_output'] = net + if expansion_transform: + net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) + # Note in contrast with expansion, we always have + # projection to produce the desired output size. + net = split_conv( + net, + num_outputs, + num_ways=split_projection, + stride=1, + scope='project', + normalizer_fn=normalizer_fn, + activation_fn=tf.identity) + if endpoints is not None: + endpoints['projection_output'] = net + if depthwise_location == 'output': + if use_explicit_padding: + net = _fixed_padding(net, kernel_size, rate) + net = depthwise_func(net, activation_fn=None) + + if callable(residual): # custom residual + net = residual(input_tensor=input_tensor, output_tensor=net) + elif (residual and + # stride check enforces that we don't add residuals when spatial + # dimensions are None + stride == 1 and + # Depth matches + net.get_shape().as_list()[3] == + input_tensor.get_shape().as_list()[3]): + net += input_tensor + return tf.identity(net, name='output') + + +def split_conv(input_tensor, + num_outputs, + num_ways, + scope, + divisible_by=8, + **kwargs): + """Creates a split convolution. + + Split convolution splits the input and output into + 'num_blocks' blocks of approximately the same size each, + and only connects $i$-th input to $i$ output. + + Args: + input_tensor: input tensor + num_outputs: number of output filters + num_ways: num blocks to split by. + scope: scope for all the operators. + divisible_by: make sure that every part is divisiable by this. 
+ **kwargs: will be passed directly into conv2d operator + Returns: + tensor + """ + b = input_tensor.get_shape().as_list()[3] + + if num_ways == 1 or min(b // num_ways, + num_outputs // num_ways) < divisible_by: + # Don't do any splitting if we end up with less than 8 filters + # on either side. + return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) + + outs = [] + input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) + output_splits = _split_divisible( + num_outputs, num_ways, divisible_by=divisible_by) + inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) + base = scope + for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): + scope = base + '_part_%d' % (i,) + n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) + n = tf.identity(n, scope + '_output') + outs.append(n) + return tf.concat(outs, 3, name=scope + '_concat') diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_test.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c0b7d5345077585b99a6a6b5e305388bfcc5eaf0 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_test.py @@ -0,0 +1,191 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for mobilenet_v2, branched from slim for fp16 performance study.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import tensorflow.compat.v1 as tf + +from models.tf1_only import mobilenet +from models.tf1_only import mobilenet_conv_blocks as ops +from models.tf1_only import mobilenet_v2 +from tensorflow.contrib import slim + + +def find_ops(optype): + """Find ops of a given type in graphdef or a graph. + + Args: + optype: operation type (e.g. Conv2D) + Returns: + List of operations. + """ + gd = tf.get_default_graph() + return [var for var in gd.get_operations() if var.type == optype] + + +class MobilenetV2Test(tf.test.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + tf.reset_default_graph() + + def testCreation(self): + spec = dict(mobilenet_v2.V2_DEF) + _, ep = mobilenet.mobilenet( + tf.placeholder(tf.float32, (10, 224, 224, 16)), conv_defs=spec) + num_convs = len(find_ops('Conv2D')) + + # This is mostly a sanity test. No deep reason for these particular + # constants. + # + # All but first 2 and last one have two convolutions, and there is one + # extra conv that is not in the spec. (logits) + self.assertEqual(num_convs, len(spec['spec']) * 2 - 2) + # Check that depthwise are exposed. 
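+    # (Only the expanded_conv layers register a 'layer_%d/depthwise_output'
+    # endpoint; the plain conv layers do not.)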
+    for i in range(2, 17):
+      self.assertIn('layer_%d/depthwise_output' % i, ep)
+
+  def testCreationNoClasses(self):
+    spec = copy.deepcopy(mobilenet_v2.V2_DEF)
+    net, ep = mobilenet.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)), conv_defs=spec,
+        num_classes=None)
+    self.assertIs(net, ep['global_pool'])
+
+  def testImageSizes(self):
+    for input_size, output_size in [(224, 7), (192, 6), (160, 5),
+                                    (128, 4), (96, 3)]:
+      tf.reset_default_graph()
+      _, ep = mobilenet_v2.mobilenet(
+          tf.placeholder(tf.float32, (10, input_size, input_size, 3)))
+
+      self.assertEqual(ep['layer_18/output'].get_shape().as_list()[1:3],
+                       [output_size] * 2)
+
+  def testWithSplits(self):
+    spec = copy.deepcopy(mobilenet_v2.V2_DEF)
+    spec['overrides'] = {
+        (ops.expanded_conv,): dict(split_expansion=2),
+    }
+    _, _ = mobilenet.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)), conv_defs=spec)
+    num_convs = len(find_ops('Conv2D'))
+    # All but 3 ops have 3 conv operators each, the remaining 3 have one,
+    # and there is one extra conv (the logits) that is not in the spec.
+    self.assertEqual(num_convs, len(spec['spec']) * 3 - 5)
+
+  def testWithOutputStride8(self):
+    out, _ = mobilenet.mobilenet_base(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)),
+        conv_defs=mobilenet_v2.V2_DEF,
+        output_stride=8,
+        scope='MobilenetV2')
+    self.assertEqual(out.get_shape().as_list()[1:3], [28, 28])
+
+  def testDivisibleBy(self):
+    tf.reset_default_graph()
+    mobilenet_v2.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 16)),
+        conv_defs=mobilenet_v2.V2_DEF,
+        divisible_by=16,
+        min_depth=32)
+    s = [op.outputs[0].get_shape().as_list()[-1] for op in find_ops('Conv2D')]
+    s = set(s)
+    self.assertSameElements([32, 64, 96, 160, 192, 320, 384, 576, 960, 1280,
+                             1001], s)
+
+  def testDivisibleByWithArgScope(self):
+    tf.reset_default_graph()
+    # Verifies that the depth_multiplier arg scope actually works
+    # if no default min_depth is provided.
+    with slim.arg_scope((mobilenet.depth_multiplier,), min_depth=32):
+      mobilenet_v2.mobilenet(
+          tf.placeholder(tf.float32, (10, 224, 224, 2)),
+          conv_defs=mobilenet_v2.V2_DEF, depth_multiplier=0.1)
+      s = [op.outputs[0].get_shape().as_list()[-1] for op in find_ops('Conv2D')]
+      s = set(s)
+      self.assertSameElements(s, [32, 192, 128, 1001])
+
+  def testFineGrained(self):
+    tf.reset_default_graph()
+    # Verifies that finegrain_classification_mode keeps the last layer wide
+    # even for a very small depth_multiplier.
+    mobilenet_v2.mobilenet(
+        tf.placeholder(tf.float32, (10, 224, 224, 2)),
+        conv_defs=mobilenet_v2.V2_DEF, depth_multiplier=0.01,
+        finegrain_classification_mode=True)
+    s = [op.outputs[0].get_shape().as_list()[-1] for op in find_ops('Conv2D')]
+    s = set(s)
+    # All convolutions will be 8->48, except for the last one.
+    self.assertSameElements(s, [8, 48, 1001, 1280])
+
+  def testMobilenetBase(self):
+    tf.reset_default_graph()
+    # Verifies that mobilenet_base returns the pre-pooling layer.
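+    # With depth_multiplier=0.1 the final 1280-channel layer scales to 128
+    # (min_depth=32 only affects the thinner layers), which the shape
+    # assertion below checks.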
+ with slim.arg_scope((mobilenet.depth_multiplier,), min_depth=32): + net, _ = mobilenet_v2.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, depth_multiplier=0.1) + self.assertEqual(net.get_shape().as_list(), [10, 7, 7, 128]) + + def testWithOutputStride16(self): + tf.reset_default_graph() + out, _ = mobilenet.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, + output_stride=16) + self.assertEqual(out.get_shape().as_list()[1:3], [14, 14]) + + def testWithOutputStride8AndExplicitPadding(self): + tf.reset_default_graph() + out, _ = mobilenet.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, + output_stride=8, + use_explicit_padding=True, + scope='MobilenetV2') + self.assertEqual(out.get_shape().as_list()[1:3], [28, 28]) + + def testWithOutputStride16AndExplicitPadding(self): + tf.reset_default_graph() + out, _ = mobilenet.mobilenet_base( + tf.placeholder(tf.float32, (10, 224, 224, 16)), + conv_defs=mobilenet_v2.V2_DEF, + output_stride=16, + use_explicit_padding=True) + self.assertEqual(out.get_shape().as_list()[1:3], [14, 14]) + + def testBatchNormScopeDoesNotHaveIsTrainingWhenItsSetToNone(self): + sc = mobilenet.training_scope(is_training=None) + self.assertNotIn('is_training', sc[slim.arg_scope_func_key( + slim.batch_norm)]) + + def testBatchNormScopeDoesHasIsTrainingWhenItsNotNone(self): + sc = mobilenet.training_scope(is_training=False) + self.assertIn('is_training', sc[slim.arg_scope_func_key(slim.batch_norm)]) + sc = mobilenet.training_scope(is_training=True) + self.assertIn('is_training', sc[slim.arg_scope_func_key(slim.batch_norm)]) + sc = mobilenet.training_scope() + self.assertIn('is_training', sc[slim.arg_scope_func_key(slim.batch_norm)]) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_v2.py b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ac811470719c6a3f867fd88484aaa862bce09e76 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/mobilenet_v2.py @@ -0,0 +1,198 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Mobilenet V2 model, branched from slim models for fp16 performance study. + +Architecture: https://arxiv.org/abs/1801.04381 + +The base model gives 72.2% accuracy on ImageNet, with 300MMadds, +3.4 M parameters. 
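+
+Typical usage (mirroring the pattern documented on `mobilenet` below):
+
+  with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()):
+    logits, endpoints = mobilenet_v2.mobilenet(input_tensor)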
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import tensorflow.compat.v1 as tf + +from models import model +from models.tf1_only import mobilenet as lib +from models.tf1_only import mobilenet_conv_blocks as ops +from tensorflow.contrib import slim + +op = lib.op + +expand_input = ops.expand_input_by_factor + +# pyformat: disable +# Architecture: https://arxiv.org/abs/1801.04381 +V2_DEF = dict( + defaults={ + # Note: these parameters of batch norm affect the architecture + # that's why they are here and not in training_scope. + (slim.batch_norm,): {'center': True, 'scale': True}, + (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { + 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 + }, + (ops.expanded_conv,): { + 'expansion_size': expand_input(6), + 'split_expansion': 1, + 'normalizer_fn': slim.batch_norm, + 'residual': True + }, + (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} + }, + spec=[ + op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), + op(ops.expanded_conv, + expansion_size=expand_input(1, divisible_by=1), + num_outputs=16), + op(ops.expanded_conv, stride=2, num_outputs=24), + op(ops.expanded_conv, stride=1, num_outputs=24), + op(ops.expanded_conv, stride=2, num_outputs=32), + op(ops.expanded_conv, stride=1, num_outputs=32), + op(ops.expanded_conv, stride=1, num_outputs=32), + op(ops.expanded_conv, stride=2, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=64), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=1, num_outputs=96), + op(ops.expanded_conv, stride=2, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=160), + op(ops.expanded_conv, stride=1, num_outputs=320), + op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) + ], +) +# pyformat: enable + + +@slim.add_arg_scope +def mobilenet(input_tensor, + num_classes=1001, + depth_multiplier=1.0, + scope='MobilenetV2', + conv_defs=None, + finegrain_classification_mode=False, + min_depth=None, + divisible_by=None, + **kwargs): + """Creates mobilenet V2 network. + + Inference mode is created by default. To create training use training_scope + below. + + with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + Args: + input_tensor: The input tensor + num_classes: number of classes + depth_multiplier: The multiplier applied to scale number of + channels in each layer. Note: this is called depth multiplier in the + paper but the name is kept for consistency with slim's model builder. + scope: Scope of the operator + conv_defs: Allows to override default conv def. + finegrain_classification_mode: When set to True, the model + will keep the last layer large even for small multipliers. Following + https://arxiv.org/abs/1801.04381 + suggests that it improves performance for ImageNet-type of problems. + *Note* ignored if final_endpoint makes the builder exit earlier. + min_depth: If provided, will ensure that all layers will have that + many channels after application of depth multiplier. + divisible_by: If provided will ensure that all layers # channels + will be divisible by this number. 
+ **kwargs: passed directly to mobilenet.mobilenet: + prediction_fn- what prediction function to use. + reuse-: whether to reuse variables (if reuse set to true, scope + must be given). + Returns: + logits/endpoints pair + + Raises: + ValueError: On invalid arguments + """ + if conv_defs is None: + conv_defs = V2_DEF + if 'multiplier' in kwargs: + raise ValueError('mobilenetv2 doesn\'t support generic ' + 'multiplier parameter use "depth_multiplier" instead.') + if finegrain_classification_mode: + conv_defs = copy.deepcopy(conv_defs) + if depth_multiplier < 1: + conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier + + depth_args = {} + # NB: do not set depth_args unless they are provided to avoid overriding + # whatever default depth_multiplier might have thanks to arg_scope. + if min_depth is not None: + depth_args['min_depth'] = min_depth + if divisible_by is not None: + depth_args['divisible_by'] = divisible_by + + with slim.arg_scope((lib.depth_multiplier,), **depth_args): + return lib.mobilenet( + input_tensor, + num_classes=num_classes, + conv_defs=conv_defs, + scope=scope, + multiplier=depth_multiplier, + **kwargs) + + +@slim.add_arg_scope +def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): + """Creates base of the mobilenet (no pooling and no logits) .""" + return mobilenet( + input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs) + + +def training_scope(**kwargs): + """Defines MobilenetV2 training scope. + + Usage: + with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): + logits, endpoints = mobilenet_v2.mobilenet(input_tensor) + + with slim. + + Args: + **kwargs: Passed to mobilenet.training_scope. The following parameters + are supported: + weight_decay- The weight decay to use for regularizing the model. + stddev- Standard deviation for initialization, if negative uses xavier. + dropout_keep_prob- dropout keep probability + bn_decay- decay for the batch norm moving averages. + + Returns: + An `arg_scope` to use for the mobilenet v2 model. + """ + return lib.training_scope(**kwargs) + + +class MobilenetModel(model.CNNModel): + """Mobilenet model configuration.""" + + def __init__(self, params=None): + super(MobilenetModel, self).__init__( + 'mobilenet', 224, 32, 0.005, params=params) + + def add_inference(self, cnn): + with slim.arg_scope(training_scope(is_training=cnn.phase_train)): + cnn.top_layer, _ = mobilenet(cnn.top_layer, is_training=cnn.phase_train) + cnn.top_size = cnn.top_layer.shape[-1].value diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_model.py b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..560d86bcaf88589734696748379150a6615a58fc --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_model.py @@ -0,0 +1,582 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Model configurations for nasnet. + +Paper: https://arxiv.org/abs/1707.07012 +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +from models import model +from models.tf1_only import nasnet_utils +from tensorflow.contrib import framework as contrib_framework +from tensorflow.contrib import layers as contrib_layers +from tensorflow.contrib import slim +from tensorflow.contrib import training as contrib_training + +arg_scope = contrib_framework.arg_scope + + +# Notes for training NASNet Cifar Model +# ------------------------------------- +# batch_size: 32 +# learning rate: 0.025 +# cosine (single period) learning rate decay +# auxiliary head loss weighting: 0.4 +# clip global norm of all gradients by 5 +def _cifar_config(is_training=True, data_format=None, total_steps=None): + drop_path_keep_prob = 1.0 if not is_training else 0.6 + return contrib_training.HParams( + stem_multiplier=3.0, + drop_path_keep_prob=drop_path_keep_prob, + num_cells=18, + use_aux_head=1, + num_conv_filters=32, + dense_dropout_keep_prob=1.0, + filter_scaling_rate=2.0, + num_reduction_layers=2, + skip_reduction_layer_input=0, + data_format=data_format or 'NHWC', + # 600 epochs with a batch size of 32 + # This is used for the drop path probabilities since it needs to increase + # the drop out probability over the course of training. + total_training_steps=total_steps or 937500, + ) + + +# Notes for training large NASNet model on ImageNet +# ------------------------------------- +# batch size (per replica): 16 +# learning rate: 0.015 * 100 +# learning rate decay factor: 0.97 +# num epochs per decay: 2.4 +# sync sgd with 100 replicas +# auxiliary head loss weighting: 0.4 +# label smoothing: 0.1 +# clip global norm of all gradients by 10 +def _large_imagenet_config(is_training=True, data_format=None, + total_steps=None): + drop_path_keep_prob = 1.0 if not is_training else 0.7 + return contrib_training.HParams( + stem_multiplier=3.0, + dense_dropout_keep_prob=0.5, + num_cells=18, + filter_scaling_rate=2.0, + num_conv_filters=168, + drop_path_keep_prob=drop_path_keep_prob, + use_aux_head=1, + num_reduction_layers=2, + skip_reduction_layer_input=1, + data_format=data_format or 'NHWC', + total_training_steps=total_steps or 250000, + ) + + +# Notes for training the mobile NASNet ImageNet model +# ------------------------------------- +# batch size (per replica): 32 +# learning rate: 0.04 * 50 +# learning rate scaling factor: 0.97 +# num epochs per decay: 2.4 +# sync sgd with 50 replicas +# auxiliary head weighting: 0.4 +# label smoothing: 0.1 +# clip global norm of all gradients by 10 +def _mobile_imagenet_config(data_format=None, total_steps=None): + return contrib_training.HParams( + stem_multiplier=1.0, + dense_dropout_keep_prob=0.5, + num_cells=12, + filter_scaling_rate=2.0, + drop_path_keep_prob=1.0, + num_conv_filters=44, + use_aux_head=1, + num_reduction_layers=2, + skip_reduction_layer_input=0, + data_format=data_format or 'NHWC', + total_training_steps=total_steps or 250000, + ) + + +def nasnet_cifar_arg_scope(weight_decay=5e-4, + batch_norm_decay=0.9, + batch_norm_epsilon=1e-5): + """Defines the default arg scope for the NASNet-A Cifar model. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: Decay for batch norm moving average. 
+ batch_norm_epsilon: Small float added to variance to avoid dividing by zero + in batch norm. + Returns: + An `arg_scope` to use for the NASNet Cifar Model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': batch_norm_decay, + # epsilon to prevent 0s in variance. + 'epsilon': batch_norm_epsilon, + 'scale': True, + 'fused': True, + } + weights_regularizer = contrib_layers.l2_regularizer(weight_decay) + weights_initializer = contrib_layers.variance_scaling_initializer( + mode='FAN_OUT') + with arg_scope( + [slim.fully_connected, slim.conv2d, slim.separable_conv2d], + weights_regularizer=weights_regularizer, + weights_initializer=weights_initializer): + with arg_scope([slim.fully_connected], activation_fn=None, scope='FC'): + with arg_scope( + [slim.conv2d, slim.separable_conv2d], + activation_fn=None, + biases_initializer=None): + with arg_scope([slim.batch_norm], **batch_norm_params) as sc: + return sc + + +def nasnet_mobile_arg_scope(weight_decay=4e-5, + batch_norm_decay=0.9997, + batch_norm_epsilon=1e-3): + """Defines the default arg scope for the NASNet-A Mobile ImageNet model. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: Decay for batch norm moving average. + batch_norm_epsilon: Small float added to variance to avoid dividing by zero + in batch norm. + Returns: + An `arg_scope` to use for the NASNet Mobile Model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': batch_norm_decay, + # epsilon to prevent 0s in variance. + 'epsilon': batch_norm_epsilon, + 'scale': True, + 'fused': True, + } + weights_regularizer = contrib_layers.l2_regularizer(weight_decay) + weights_initializer = contrib_layers.variance_scaling_initializer( + mode='FAN_OUT') + with arg_scope( + [slim.fully_connected, slim.conv2d, slim.separable_conv2d], + weights_regularizer=weights_regularizer, + weights_initializer=weights_initializer): + with arg_scope([slim.fully_connected], activation_fn=None, scope='FC'): + with arg_scope( + [slim.conv2d, slim.separable_conv2d], + activation_fn=None, + biases_initializer=None): + with arg_scope([slim.batch_norm], **batch_norm_params) as sc: + return sc + + +def nasnet_large_arg_scope(weight_decay=5e-5, + batch_norm_decay=0.9997, + batch_norm_epsilon=1e-3): + """Defines the default arg scope for the NASNet-A Large ImageNet model. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: Decay for batch norm moving average. + batch_norm_epsilon: Small float added to variance to avoid dividing by zero + in batch norm. + Returns: + An `arg_scope` to use for the NASNet Large Model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': batch_norm_decay, + # epsilon to prevent 0s in variance. 
+ 'epsilon': batch_norm_epsilon, + 'scale': True, + 'fused': True, + } + weights_regularizer = contrib_layers.l2_regularizer(weight_decay) + weights_initializer = contrib_layers.variance_scaling_initializer( + mode='FAN_OUT') + with arg_scope( + [slim.fully_connected, slim.conv2d, slim.separable_conv2d], + weights_regularizer=weights_regularizer, + weights_initializer=weights_initializer): + with arg_scope([slim.fully_connected], activation_fn=None, scope='FC'): + with arg_scope( + [slim.conv2d, slim.separable_conv2d], + activation_fn=None, + biases_initializer=None): + with arg_scope([slim.batch_norm], **batch_norm_params) as sc: + return sc + + +def _build_aux_head(net, end_points, num_classes, hparams, scope): + """Auxiliary head used for all models across all datasets.""" + with tf.variable_scope(scope): + aux_logits = tf.identity(net) + with tf.variable_scope('aux_logits'): + aux_logits = slim.avg_pool2d( + aux_logits, [5, 5], stride=3, padding='VALID') + aux_logits = slim.conv2d(aux_logits, 128, [1, 1], scope='proj') + aux_logits = slim.batch_norm(aux_logits, scope='aux_bn0') + aux_logits = tf.nn.relu(aux_logits) + # Shape of feature map before the final layer. + shape = aux_logits.shape + if hparams.data_format == 'NHWC': + shape = shape[1:3] + else: + shape = shape[2:4] + aux_logits = slim.conv2d(aux_logits, 768, shape, padding='VALID') + aux_logits = slim.batch_norm(aux_logits, scope='aux_bn1') + aux_logits = tf.nn.relu(aux_logits) + aux_logits = contrib_layers.flatten(aux_logits) + aux_logits = slim.fully_connected(aux_logits, num_classes) + end_points['AuxLogits'] = aux_logits + + +def _imagenet_stem(inputs, hparams, stem_cell): + """Stem used for models trained on ImageNet.""" + num_stem_cells = 2 + + # 149 x 149 x 32 + num_stem_filters = int(32 * hparams.stem_multiplier) + net = slim.conv2d( + inputs, + num_stem_filters, [3, 3], + stride=2, + scope='conv0', + padding='VALID') + net = slim.batch_norm(net, scope='conv0_bn') + + # Run the reduction cells + cell_outputs = [None, net] + filter_scaling = 1.0 / (hparams.filter_scaling_rate**num_stem_cells) + for cell_num in range(num_stem_cells): + net = stem_cell( + net, + scope='cell_stem_{}'.format(cell_num), + filter_scaling=filter_scaling, + stride=2, + prev_layer=cell_outputs[-2], + cell_num=cell_num) + cell_outputs.append(net) + filter_scaling *= hparams.filter_scaling_rate + return net, cell_outputs + + +def _cifar_stem(inputs, hparams): + """Stem used for models trained on Cifar.""" + num_stem_filters = int(hparams.num_conv_filters * hparams.stem_multiplier) + net = slim.conv2d(inputs, num_stem_filters, 3, scope='l1_stem_3x3') + net = slim.batch_norm(net, scope='l1_stem_bn') + return net, [None, net] + + +def build_nasnet_cifar(images, + num_classes=None, + is_training=True, + data_format=None, + total_steps=None): + """Build NASNet model for the Cifar Dataset.""" + hparams = _cifar_config( + is_training=is_training, data_format=data_format, total_steps=total_steps) + + if tf.test.is_gpu_available() and hparams.data_format == 'NHWC': + tf.logging.info('A GPU is available on the machine, consider using NCHW ' + 'data format for increased speed on GPU.') + + # Calculate the total number of cells in the network + # Add 2 for the reduction cells + total_num_cells = hparams.num_cells + 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, 
hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + with arg_scope( + [slim.dropout, nasnet_utils.drop_path, slim.batch_norm], + is_training=is_training): + with arg_scope( + [ + slim.avg_pool2d, slim.max_pool2d, slim.conv2d, slim.batch_norm, + slim.separable_conv2d, nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim + ], + data_format=hparams.data_format): + return _build_nasnet_base( + images, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + num_classes=num_classes, + hparams=hparams, + is_training=is_training, + stem_type='cifar') + + +build_nasnet_cifar.default_image_size = 32 + + +def build_nasnet_mobile(images, + num_classes=None, + is_training=True, + data_format=None, + total_steps=None, + final_endpoint=None): + """Build NASNet Mobile model for the ImageNet Dataset.""" + hparams = _mobile_imagenet_config( + data_format=data_format, total_steps=total_steps) + + if tf.test.is_gpu_available() and hparams.data_format == 'NHWC': + tf.logging.info('A GPU is available on the machine, consider using NCHW ' + 'data format for increased speed on GPU.') + + # Calculate the total number of cells in the network + # Add 2 for the reduction cells + total_num_cells = hparams.num_cells + 2 + # If ImageNet, then add an additional two for the stem cells + total_num_cells += 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + with arg_scope( + [slim.dropout, nasnet_utils.drop_path, slim.batch_norm], + is_training=is_training): + with arg_scope( + [ + slim.avg_pool2d, slim.max_pool2d, slim.conv2d, slim.batch_norm, + slim.separable_conv2d, nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim + ], + data_format=hparams.data_format): + return _build_nasnet_base( + images, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + num_classes=num_classes, + hparams=hparams, + is_training=is_training, + stem_type='imagenet', + final_endpoint=final_endpoint) + + +build_nasnet_mobile.default_image_size = 224 + + +def build_nasnet_large(images, + num_classes=None, + is_training=True, + data_format=None, + total_steps=None, + final_endpoint=None): + """Build NASNet Large model for the ImageNet Dataset.""" + hparams = _large_imagenet_config( + is_training=is_training, data_format=data_format, total_steps=total_steps) + + if tf.test.is_gpu_available() and hparams.data_format == 'NHWC': + tf.logging.info('A GPU is available on the machine, consider using NCHW ' + 'data format for increased speed on GPU.') + + # Calculate the total number of cells in the network + # Add 2 for the reduction cells + total_num_cells = hparams.num_cells + 2 + # If ImageNet, then add an additional two for the stem cells + total_num_cells += 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, total_num_cells, + hparams.total_training_steps) + with arg_scope( + [slim.dropout, nasnet_utils.drop_path, slim.batch_norm], + is_training=is_training): + with arg_scope( + [ 
+ slim.avg_pool2d, slim.max_pool2d, slim.conv2d, slim.batch_norm, + slim.separable_conv2d, nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim + ], + data_format=hparams.data_format): + return _build_nasnet_base( + images, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + num_classes=num_classes, + hparams=hparams, + is_training=is_training, + stem_type='imagenet', + final_endpoint=final_endpoint) + + +build_nasnet_large.default_image_size = 331 + + +def _build_nasnet_base(images, + normal_cell, + reduction_cell, + num_classes, + hparams, + is_training, + stem_type, + final_endpoint=None): + """Constructs a NASNet image model.""" + + end_points = {} + + def add_and_check_endpoint(endpoint_name, net): + end_points[endpoint_name] = net + return final_endpoint and (endpoint_name == final_endpoint) + + # Find where to place the reduction cells or stride normal cells + reduction_indices = nasnet_utils.calc_reduction_layers( + hparams.num_cells, hparams.num_reduction_layers) + stem_cell = reduction_cell + + if stem_type == 'imagenet': + stem = lambda: _imagenet_stem(images, hparams, stem_cell) + elif stem_type == 'cifar': + stem = lambda: _cifar_stem(images, hparams) + else: + raise ValueError('Unknown stem_type: ', stem_type) + net, cell_outputs = stem() + if add_and_check_endpoint('Stem', net): + return net, end_points + + # Setup for building in the auxiliary head. + aux_head_cell_idxes = [] + if len(reduction_indices) >= 2: + aux_head_cell_idxes.append(reduction_indices[1] - 1) + + # Run the cells + filter_scaling = 1.0 + # true_cell_num accounts for the stem cells + true_cell_num = 2 if stem_type == 'imagenet' else 0 + for cell_num in range(hparams.num_cells): + stride = 1 + if hparams.skip_reduction_layer_input: + prev_layer = cell_outputs[-2] + if cell_num in reduction_indices: + filter_scaling *= hparams.filter_scaling_rate + net = reduction_cell( + net, + scope='reduction_cell_{}'.format(reduction_indices.index(cell_num)), + filter_scaling=filter_scaling, + stride=2, + prev_layer=cell_outputs[-2], + cell_num=true_cell_num) + if add_and_check_endpoint( + 'Reduction_Cell_{}'.format(reduction_indices.index(cell_num)), net): + return net, end_points + true_cell_num += 1 + cell_outputs.append(net) + if not hparams.skip_reduction_layer_input: + prev_layer = cell_outputs[-2] + net = normal_cell( + net, + scope='cell_{}'.format(cell_num), + filter_scaling=filter_scaling, + stride=stride, + prev_layer=prev_layer, + cell_num=true_cell_num) + + if add_and_check_endpoint('Cell_{}'.format(cell_num), net): + return net, end_points + true_cell_num += 1 + if (hparams.use_aux_head and cell_num in aux_head_cell_idxes and + num_classes and is_training): + aux_net = tf.nn.relu(net) + _build_aux_head( + aux_net, + end_points, + num_classes, + hparams, + scope='aux_{}'.format(cell_num)) + cell_outputs.append(net) + + # Final softmax layer + with tf.variable_scope('final_layer'): + net = tf.nn.relu(net) + net = nasnet_utils.global_avg_pool(net) + if add_and_check_endpoint('global_pool', net) or num_classes is None: + return net, end_points + net = slim.dropout(net, hparams.dense_dropout_keep_prob, scope='dropout') + logits = slim.fully_connected(net, num_classes) + + if add_and_check_endpoint('Logits', logits): + return net, end_points + + predictions = tf.nn.softmax(logits, name='predictions') + if add_and_check_endpoint('Predictions', predictions): + return net, end_points + return logits, end_points + + +class 
NasnetModel(model.CNNModel): + """Nasnet model configuration.""" + + def __init__(self, params=None): + super(NasnetModel, self).__init__('nasnet', 224, 32, 0.005, params=params) + + def add_inference(self, cnn): + tf.logging.info('input_image_shape: {}'.format(cnn.top_layer.shape)) + cnn.top_layer, _ = build_nasnet_mobile( + images=cnn.top_layer, + is_training=cnn.phase_train, + data_format=cnn.data_format) + cnn.top_size = cnn.top_layer.shape[-1].value + + +class NasnetLargeModel(model.CNNModel): + """Nasnet model configuration.""" + + def __init__(self, params=None): + super(NasnetLargeModel, self).__init__( + 'nasnet', 331, 16, 0.005, params=params) + + def add_inference(self, cnn): + tf.logging.info('input_image_shape: {}'.format(cnn.top_layer.shape)) + cnn.top_layer, _ = build_nasnet_large( + images=cnn.top_layer, + is_training=cnn.phase_train, + data_format=cnn.data_format) + cnn.top_size = cnn.top_layer.shape[-1].value + + +class NasnetCifarModel(model.CNNModel): + """Nasnet cifar model configuration.""" + + def __init__(self, params=None): + super(NasnetCifarModel, self).__init__( + 'nasnet', 32, 32, 0.025, params=params) + + def add_inference(self, cnn): + tf.logging.info('input_image_shape: {}'.format(cnn.top_layer.shape)) + cnn.top_layer, _ = build_nasnet_cifar( + images=cnn.top_layer, + is_training=cnn.phase_train, + data_format=cnn.data_format) + cnn.top_size = cnn.top_layer.shape[-1].value diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_test.py b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..4e3bc3776e992c2688e6dd9dfeddbbf7835c6774 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_test.py @@ -0,0 +1,289 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for nasnet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf + +from models.tf1_only import nasnet_model as nasnet +from tensorflow.contrib import slim + + +class NASNetTest(tf.test.TestCase): + + def testBuildLogitsCifarModel(self): + batch_size = 5 + height, width = 32, 32 + num_classes = 10 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_cifar_arg_scope()): + logits, end_points = nasnet.build_nasnet_cifar(inputs, num_classes) + auxlogits = end_points['AuxLogits'] + predictions = end_points['Predictions'] + self.assertListEqual(auxlogits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(predictions.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildLogitsMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + logits, end_points = nasnet.build_nasnet_mobile(inputs, num_classes) + auxlogits = end_points['AuxLogits'] + predictions = end_points['Predictions'] + self.assertListEqual(auxlogits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(predictions.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildLogitsLargeModel(self): + batch_size = 5 + height, width = 331, 331 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + logits, end_points = nasnet.build_nasnet_large(inputs, num_classes) + auxlogits = end_points['AuxLogits'] + predictions = end_points['Predictions'] + self.assertListEqual(auxlogits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertListEqual(predictions.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildPreLogitsCifarModel(self): + batch_size = 5 + height, width = 32, 32 + num_classes = None + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_cifar_arg_scope()): + net, end_points = nasnet.build_nasnet_cifar(inputs, num_classes) + self.assertNotIn('AuxLogits', end_points) + self.assertNotIn('Predictions', end_points) + self.assertTrue(net.op.name.startswith('final_layer/Mean')) + self.assertListEqual(net.get_shape().as_list(), [batch_size, 768]) + + def testBuildPreLogitsMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = None + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + net, end_points = nasnet.build_nasnet_mobile(inputs, num_classes) + self.assertNotIn('AuxLogits', end_points) + self.assertNotIn('Predictions', end_points) + self.assertTrue(net.op.name.startswith('final_layer/Mean')) + self.assertListEqual(net.get_shape().as_list(), [batch_size, 1056]) + + def testBuildPreLogitsLargeModel(self): + batch_size = 5 + height, width = 331, 331 + 
num_classes = None + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + net, end_points = nasnet.build_nasnet_large(inputs, num_classes) + self.assertNotIn('AuxLogits', end_points) + self.assertNotIn('Predictions', end_points) + self.assertTrue(net.op.name.startswith('final_layer/Mean')) + self.assertListEqual(net.get_shape().as_list(), [batch_size, 4032]) + + def testAllEndPointsShapesCifarModel(self): + batch_size = 5 + height, width = 32, 32 + num_classes = 10 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_cifar_arg_scope()): + _, end_points = nasnet.build_nasnet_cifar(inputs, num_classes) + endpoints_shapes = {'Stem': [batch_size, 32, 32, 96], + 'Cell_0': [batch_size, 32, 32, 192], + 'Cell_1': [batch_size, 32, 32, 192], + 'Cell_2': [batch_size, 32, 32, 192], + 'Cell_3': [batch_size, 32, 32, 192], + 'Cell_4': [batch_size, 32, 32, 192], + 'Cell_5': [batch_size, 32, 32, 192], + 'Cell_6': [batch_size, 16, 16, 384], + 'Cell_7': [batch_size, 16, 16, 384], + 'Cell_8': [batch_size, 16, 16, 384], + 'Cell_9': [batch_size, 16, 16, 384], + 'Cell_10': [batch_size, 16, 16, 384], + 'Cell_11': [batch_size, 16, 16, 384], + 'Cell_12': [batch_size, 8, 8, 768], + 'Cell_13': [batch_size, 8, 8, 768], + 'Cell_14': [batch_size, 8, 8, 768], + 'Cell_15': [batch_size, 8, 8, 768], + 'Cell_16': [batch_size, 8, 8, 768], + 'Cell_17': [batch_size, 8, 8, 768], + 'Reduction_Cell_0': [batch_size, 16, 16, 256], + 'Reduction_Cell_1': [batch_size, 8, 8, 512], + 'global_pool': [batch_size, 768], + # Logits and predictions + 'AuxLogits': [batch_size, num_classes], + 'Logits': [batch_size, num_classes], + 'Predictions': [batch_size, num_classes]} + self.assertCountEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + tf.logging.info('Endpoint name: {}'.format(endpoint_name)) + expected_shape = endpoints_shapes[endpoint_name] + self.assertIn(endpoint_name, end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testAllEndPointsShapesMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + _, end_points = nasnet.build_nasnet_mobile(inputs, num_classes) + endpoints_shapes = {'Stem': [batch_size, 28, 28, 88], + 'Cell_0': [batch_size, 28, 28, 264], + 'Cell_1': [batch_size, 28, 28, 264], + 'Cell_2': [batch_size, 28, 28, 264], + 'Cell_3': [batch_size, 28, 28, 264], + 'Cell_4': [batch_size, 14, 14, 528], + 'Cell_5': [batch_size, 14, 14, 528], + 'Cell_6': [batch_size, 14, 14, 528], + 'Cell_7': [batch_size, 14, 14, 528], + 'Cell_8': [batch_size, 7, 7, 1056], + 'Cell_9': [batch_size, 7, 7, 1056], + 'Cell_10': [batch_size, 7, 7, 1056], + 'Cell_11': [batch_size, 7, 7, 1056], + 'Reduction_Cell_0': [batch_size, 14, 14, 352], + 'Reduction_Cell_1': [batch_size, 7, 7, 704], + 'global_pool': [batch_size, 1056], + # Logits and predictions + 'AuxLogits': [batch_size, num_classes], + 'Logits': [batch_size, num_classes], + 'Predictions': [batch_size, num_classes]} + self.assertCountEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + tf.logging.info('Endpoint name: {}'.format(endpoint_name)) + expected_shape = endpoints_shapes[endpoint_name] + 
self.assertIn(endpoint_name, end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testAllEndPointsShapesLargeModel(self): + batch_size = 5 + height, width = 331, 331 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + _, end_points = nasnet.build_nasnet_large(inputs, num_classes) + endpoints_shapes = {'Stem': [batch_size, 42, 42, 336], + 'Cell_0': [batch_size, 42, 42, 1008], + 'Cell_1': [batch_size, 42, 42, 1008], + 'Cell_2': [batch_size, 42, 42, 1008], + 'Cell_3': [batch_size, 42, 42, 1008], + 'Cell_4': [batch_size, 42, 42, 1008], + 'Cell_5': [batch_size, 42, 42, 1008], + 'Cell_6': [batch_size, 21, 21, 2016], + 'Cell_7': [batch_size, 21, 21, 2016], + 'Cell_8': [batch_size, 21, 21, 2016], + 'Cell_9': [batch_size, 21, 21, 2016], + 'Cell_10': [batch_size, 21, 21, 2016], + 'Cell_11': [batch_size, 21, 21, 2016], + 'Cell_12': [batch_size, 11, 11, 4032], + 'Cell_13': [batch_size, 11, 11, 4032], + 'Cell_14': [batch_size, 11, 11, 4032], + 'Cell_15': [batch_size, 11, 11, 4032], + 'Cell_16': [batch_size, 11, 11, 4032], + 'Cell_17': [batch_size, 11, 11, 4032], + 'Reduction_Cell_0': [batch_size, 21, 21, 1344], + 'Reduction_Cell_1': [batch_size, 11, 11, 2688], + 'global_pool': [batch_size, 4032], + # Logits and predictions + 'AuxLogits': [batch_size, num_classes], + 'Logits': [batch_size, num_classes], + 'Predictions': [batch_size, num_classes]} + self.assertCountEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + tf.logging.info('Endpoint name: {}'.format(endpoint_name)) + expected_shape = endpoints_shapes[endpoint_name] + self.assertIn(endpoint_name, end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testVariablesSetDeviceMobileModel(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + inputs = tf.random_uniform((batch_size, height, width, 3)) + tf.train.create_global_step() + # Force all Variables to reside on the device. 
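+    # Build the same graph once under a CPU device scope and once under a
+    # GPU device scope, then verify that every global variable was placed on
+    # the requested device.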
+ with tf.variable_scope('on_cpu'), tf.device('/cpu:0'): + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + nasnet.build_nasnet_mobile(inputs, num_classes) + with tf.variable_scope('on_gpu'), tf.device('/gpu:0'): + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + nasnet.build_nasnet_mobile(inputs, num_classes) + for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_cpu'): + self.assertDeviceEqual(v.device, '/cpu:0') + for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_gpu'): + self.assertDeviceEqual(v.device, '/gpu:0') + + def testUnknownBatchSizeMobileModel(self): + batch_size = 1 + height, width = 224, 224 + num_classes = 1000 + with self.test_session() as sess: + inputs = tf.placeholder(tf.float32, (None, height, width, 3)) + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + logits, _ = nasnet.build_nasnet_mobile(inputs, num_classes) + self.assertListEqual(logits.get_shape().as_list(), + [None, num_classes]) + images = tf.random_uniform((batch_size, height, width, 3)) + sess.run(tf.global_variables_initializer()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch_size, num_classes)) + + def testEvaluationMobileModel(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + with self.test_session() as sess: + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + with slim.arg_scope(nasnet.nasnet_mobile_arg_scope()): + logits, _ = nasnet.build_nasnet_mobile(eval_inputs, + num_classes, + is_training=False) + predictions = tf.argmax(logits, 1) + sess.run(tf.global_variables_initializer()) + output = sess.run(predictions) + self.assertEqual(output.shape, (batch_size,)) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_utils.py b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b280b3ea85c35ca9f804ebecbf300d98bda6baa --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/nasnet_utils.py @@ -0,0 +1,492 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A custom module for some common operations used by NASNet. 
+ +Functions exposed in this file: +- calc_reduction_layers +- get_channel_index +- get_channel_dim +- global_avg_pool +- factorized_reduction +- drop_path + +Classes exposed in this file: +- NasNetABaseCell +- NasNetANormalCell +- NasNetAReductionCell +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.contrib import framework as contrib_framework +from tensorflow.contrib import slim + +arg_scope = contrib_framework.arg_scope + +DATA_FORMAT_NCHW = 'NCHW' +DATA_FORMAT_NHWC = 'NHWC' +INVALID = 'null' + + +def calc_reduction_layers(num_cells, num_reduction_layers): + """Figure out what layers should have reductions.""" + reduction_layers = [] + for pool_num in range(1, num_reduction_layers + 1): + layer_num = (float(pool_num) / (num_reduction_layers + 1)) * num_cells + layer_num = int(layer_num) + reduction_layers.append(layer_num) + return reduction_layers + + +@contrib_framework.add_arg_scope +def get_channel_index(data_format=INVALID): + assert data_format != INVALID + axis = 3 if data_format == 'NHWC' else 1 + return axis + + +@contrib_framework.add_arg_scope +def get_channel_dim(shape, data_format=INVALID): + assert data_format != INVALID + assert len(shape) == 4 + if data_format == 'NHWC': + return int(shape[3]) + elif data_format == 'NCHW': + return int(shape[1]) + else: + raise ValueError('Not a valid data_format', data_format) + + +@contrib_framework.add_arg_scope +def global_avg_pool(x, data_format=INVALID): + """Average pool away the height and width spatial dimensions of x.""" + assert data_format != INVALID + assert data_format in ['NHWC', 'NCHW'] + assert x.shape.ndims == 4 + if data_format == 'NHWC': + return tf.reduce_mean(x, [1, 2]) + else: + return tf.reduce_mean(x, [2, 3]) + + +@contrib_framework.add_arg_scope +def factorized_reduction(net, output_filters, stride, data_format=INVALID): + """Reduces the shape of net without information loss due to striding.""" + assert output_filters % 2 == 0, ( + 'Need even number of filters when using this factorized reduction.') + assert data_format != INVALID + if stride == 1: + net = slim.conv2d(net, output_filters, 1, scope='path_conv') + net = slim.batch_norm(net, scope='path_bn') + return net + if data_format == 'NHWC': + stride_spec = [1, stride, stride, 1] + else: + stride_spec = [1, 1, stride, stride] + + # Skip path 1 + path1 = tf.nn.avg_pool( + net, [1, 1, 1, 1], stride_spec, 'VALID', data_format=data_format) + path1 = slim.conv2d(path1, int(output_filters / 2), 1, scope='path1_conv') + + # Skip path 2 + # First pad with 0's on the right and bottom, then shift the filter to + # include those 0's that were added. 
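+  # The second path therefore samples the input at a one-pixel diagonal
+  # offset relative to the first, so the concatenated result draws on two
+  # interleaved grids instead of discarding the odd-indexed positions.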
+  if data_format == 'NHWC':
+    pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
+    path2 = tf.pad(net, pad_arr)[:, 1:, 1:, :]
+    concat_axis = 3
+  else:
+    pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]]
+    path2 = tf.pad(net, pad_arr)[:, :, 1:, 1:]
+    concat_axis = 1
+
+  path2 = tf.nn.avg_pool(
+      path2, [1, 1, 1, 1], stride_spec, 'VALID', data_format=data_format)
+  path2 = slim.conv2d(path2, int(output_filters / 2), 1, scope='path2_conv')
+
+  # Concat and apply BN
+  final_path = tf.concat(values=[path1, path2], axis=concat_axis)
+  final_path = slim.batch_norm(final_path, scope='final_path_bn')
+  return final_path
+
+
+@contrib_framework.add_arg_scope
+def drop_path(net, keep_prob, is_training=True):
+  """Drops out a whole example hiddenstate with the specified probability."""
+  if is_training:
+    batch_size = tf.shape(net)[0]
+    noise_shape = [batch_size, 1, 1, 1]
+    keep_prob = tf.cast(keep_prob, dtype=net.dtype)
+    random_tensor = keep_prob
+    random_tensor += tf.random_uniform(noise_shape, dtype=net.dtype)
+    binary_tensor = tf.floor(random_tensor)
+    net = tf.div(net, keep_prob) * binary_tensor
+  return net
+
+
+def _operation_to_filter_shape(operation):
+  splitted_operation = operation.split('x')
+  filter_shape = int(splitted_operation[0][-1])
+  assert filter_shape == int(
+      splitted_operation[1][0]), 'Rectangular filters not supported.'
+  return filter_shape
+
+
+def _operation_to_num_layers(operation):
+  splitted_operation = operation.split('_')
+  if 'x' in splitted_operation[-1]:
+    return 1
+  return int(splitted_operation[-1])
+
+
+def _operation_to_info(operation):
+  """Takes in an operation name and returns meta information.
+
+  An example would be 'separable_3x3_4' -> (4, 3).
+
+  Args:
+    operation: String that corresponds to convolution operation.
+
+  Returns:
+    Tuple of (num layers, filter shape).
+ """ + num_layers = _operation_to_num_layers(operation) + filter_shape = _operation_to_filter_shape(operation) + return num_layers, filter_shape + + +def _stacked_separable_conv(net, stride, operation, filter_size): + """Takes in an operations and parses it to the correct sep operation.""" + num_layers, kernel_size = _operation_to_info(operation) + net_type = net.dtype + net = tf.cast(net, tf.float32) if net_type == tf.float16 else net + + for layer_num in range(num_layers - 1): + net = tf.nn.relu(net) + net = slim.separable_conv2d( + net, + filter_size, + kernel_size, + depth_multiplier=1, + scope='separable_{0}x{0}_{1}'.format(kernel_size, layer_num + 1), + stride=stride) + net = slim.batch_norm( + net, scope='bn_sep_{0}x{0}_{1}'.format(kernel_size, layer_num + 1)) + stride = 1 + net = tf.nn.relu(net) + net = slim.separable_conv2d( + net, + filter_size, + kernel_size, + depth_multiplier=1, + scope='separable_{0}x{0}_{1}'.format(kernel_size, num_layers), + stride=stride) + net = slim.batch_norm( + net, scope='bn_sep_{0}x{0}_{1}'.format(kernel_size, num_layers)) + net = tf.cast(net, net_type) + return net + + +def _operation_to_pooling_type(operation): + """Takes in the operation string and returns the pooling type.""" + splitted_operation = operation.split('_') + return splitted_operation[0] + + +def _operation_to_pooling_shape(operation): + """Takes in the operation string and returns the pooling kernel shape.""" + splitted_operation = operation.split('_') + shape = splitted_operation[-1] + assert 'x' in shape + filter_height, filter_width = shape.split('x') + assert filter_height == filter_width + return int(filter_height) + + +def _operation_to_pooling_info(operation): + """Parses the pooling operation string to return its type and shape.""" + pooling_type = _operation_to_pooling_type(operation) + pooling_shape = _operation_to_pooling_shape(operation) + return pooling_type, pooling_shape + + +def _pooling(net, stride, operation): + """Parses operation and performs the correct pooling operation on net.""" + padding = 'SAME' + pooling_type, pooling_shape = _operation_to_pooling_info(operation) + if pooling_type == 'avg': + net = slim.avg_pool2d(net, pooling_shape, stride=stride, padding=padding) + elif pooling_type == 'max': + net = slim.max_pool2d(net, pooling_shape, stride=stride, padding=padding) + else: + raise NotImplementedError('Unimplemented pooling type: ', pooling_type) + return net + + +class NasNetABaseCell(object): # pylint: disable=g-classes-have-attributes + """NASNet Cell class that is used as a 'layer' in image architectures. + + Args: + num_conv_filters: The number of filters for each convolution operation. + operations: List of operations that are performed in the NASNet Cell in + order. + used_hiddenstates: Binary array that signals if the hiddenstate was used + within the cell. This is used to determine what outputs of the cell + should be concatenated together. + hiddenstate_indices: Determines what hiddenstates should be combined + together with the specified operations to create the NASNet cell. 
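+    drop_path_keep_prob: Keep probability used by drop_path regularization
+      (1.0 disables it).
+    total_num_cells: Total number of cells in the network; used to scale the
+      per-layer drop_path rate.
+    total_training_steps: Total number of training steps; used to anneal the
+      drop_path rate over the course of training.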
+ """ + + def __init__(self, num_conv_filters, operations, used_hiddenstates, + hiddenstate_indices, drop_path_keep_prob, total_num_cells, + total_training_steps): + self._num_conv_filters = num_conv_filters + self._operations = operations + self._used_hiddenstates = used_hiddenstates + self._hiddenstate_indices = hiddenstate_indices + self._drop_path_keep_prob = drop_path_keep_prob + self._total_num_cells = total_num_cells + self._total_training_steps = total_training_steps + + def _reduce_prev_layer(self, prev_layer, curr_layer): + """Matches dimension of prev_layer to the curr_layer.""" + # Set the prev layer to the current layer if it is none + if prev_layer is None: + return curr_layer + curr_num_filters = self._filter_size + prev_num_filters = get_channel_dim(prev_layer.shape) + curr_filter_shape = int(curr_layer.shape[2]) + prev_filter_shape = int(prev_layer.shape[2]) + if curr_filter_shape != prev_filter_shape: + prev_layer = tf.nn.relu(prev_layer) + prev_layer = factorized_reduction(prev_layer, curr_num_filters, stride=2) + elif curr_num_filters != prev_num_filters: + prev_layer = tf.nn.relu(prev_layer) + prev_layer = slim.conv2d( + prev_layer, curr_num_filters, 1, scope='prev_1x1') + prev_layer = slim.batch_norm(prev_layer, scope='prev_bn') + return prev_layer + + def _cell_base(self, net, prev_layer): + """Runs the beginning of the conv cell before the predicted ops are run.""" + num_filters = self._filter_size + + # Check to be sure prev layer stuff is setup correctly + prev_layer = self._reduce_prev_layer(prev_layer, net) + + net = tf.nn.relu(net) + net = slim.conv2d(net, num_filters, 1, scope='1x1') + net = slim.batch_norm(net, scope='beginning_bn') + split_axis = get_channel_index() + net = tf.split(axis=split_axis, num_or_size_splits=1, value=net) + for split in net: + assert int(split.shape[split_axis] == int( + self._num_conv_filters * self._filter_scaling)) + net.append(prev_layer) + return net + + def __call__(self, + net, + scope=None, + filter_scaling=1, + stride=1, + prev_layer=None, + cell_num=-1): + """Runs the conv cell.""" + self._cell_num = cell_num + self._filter_scaling = filter_scaling + self._filter_size = int(self._num_conv_filters * filter_scaling) + + i = 0 + with tf.variable_scope(scope): + net = self._cell_base(net, prev_layer) + for iteration in range(5): + with tf.variable_scope('comb_iter_{}'.format(iteration)): + left_hiddenstate_idx, right_hiddenstate_idx = ( + self._hiddenstate_indices[i], self._hiddenstate_indices[i + 1]) + original_input_left = left_hiddenstate_idx < 2 + original_input_right = right_hiddenstate_idx < 2 + h1 = net[left_hiddenstate_idx] + h2 = net[right_hiddenstate_idx] + + operation_left = self._operations[i] + operation_right = self._operations[i + 1] + i += 2 + # Apply conv operations + with tf.variable_scope('left'): + h1 = self._apply_conv_operation(h1, operation_left, stride, + original_input_left) + with tf.variable_scope('right'): + h2 = self._apply_conv_operation(h2, operation_right, stride, + original_input_right) + + # Combine hidden states using 'add'. 
+ with tf.variable_scope('combine'): + h = h1 + h2 + + # Add hiddenstate to the list of hiddenstates we can choose from + net.append(h) + + with tf.variable_scope('cell_output'): + net = self._combine_unused_states(net) + + return net + + def _apply_conv_operation(self, net, operation, stride, + is_from_original_input): + """Applies the predicted conv operation to net.""" + # Dont stride if this is not one of the original hiddenstates + if stride > 1 and not is_from_original_input: + stride = 1 + input_filters = get_channel_dim(net.shape) + filter_size = self._filter_size + if 'separable' in operation: + net = _stacked_separable_conv(net, stride, operation, filter_size) + elif operation in ['none']: + # Check if a stride is needed, then use a strided 1x1 here + if stride > 1 or (input_filters != filter_size): + net = tf.nn.relu(net) + net = slim.conv2d(net, filter_size, 1, stride=stride, scope='1x1') + net = slim.batch_norm(net, scope='bn_1') + elif 'pool' in operation: + net = _pooling(net, stride, operation) + if input_filters != filter_size: + net = slim.conv2d(net, filter_size, 1, stride=1, scope='1x1') + net = slim.batch_norm(net, scope='bn_1') + else: + raise ValueError('Unimplemented operation', operation) + + if operation != 'none': + net = self._apply_drop_path(net) + return net + + def _combine_unused_states(self, net): + """Concatenate the unused hidden states of the cell.""" + used_hiddenstates = self._used_hiddenstates + + final_height = int(net[-1].shape[2]) + final_num_filters = get_channel_dim(net[-1].shape) + assert len(used_hiddenstates) == len(net) + for idx, used_h in enumerate(used_hiddenstates): + curr_height = int(net[idx].shape[2]) + curr_num_filters = get_channel_dim(net[idx].shape) + + # Determine if a reduction should be applied to make the number of + # filters match. + should_reduce = final_num_filters != curr_num_filters + should_reduce = (final_height != curr_height) or should_reduce + should_reduce = should_reduce and not used_h + if should_reduce: + stride = 2 if final_height != curr_height else 1 + with tf.variable_scope('reduction_{}'.format(idx)): + net[idx] = factorized_reduction(net[idx], final_num_filters, stride) + + states_to_combine = ([ + h for h, is_used in zip(net, used_hiddenstates) if not is_used + ]) + + # Return the concat of all the states + concat_axis = get_channel_index() + net = tf.concat(values=states_to_combine, axis=concat_axis) + return net + + @contrib_framework.add_arg_scope # No public API. For internal use only. + def _apply_drop_path(self, + net, + current_step=None, + use_summaries=True, + drop_connect_version='v3'): + """Apply drop_path regularization. + + Args: + net: the Tensor that gets drop_path regularization applied. + current_step: a float32 Tensor with the current global_step value, + to be divided by hparams.total_training_steps. Usually None, which + defaults to tf.train.get_or_create_global_step() properly casted. + use_summaries: a Python boolean. If set to False, no summaries are output. + drop_connect_version: one of 'v1', 'v2', 'v3', controlling whether + the dropout rate is scaled by current_step (v1), layer (v2), or + both (v3, the default). + + Returns: + The dropped-out value of `net`. 
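+
+    Worked example (illustrative numbers only): with drop_path_keep_prob=0.6,
+    cell_num=5, total_num_cells=10, and training half-way through, 'v3' first
+    scales the keep probability by layer, 1 - (6/10) * (1 - 0.6) = 0.76, and
+    then by step, 1 - 0.5 * (1 - 0.76) = 0.88.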
+ """ + drop_path_keep_prob = self._drop_path_keep_prob + if drop_path_keep_prob < 1.0: + assert drop_connect_version in ['v1', 'v2', 'v3'] + if drop_connect_version in ['v2', 'v3']: + # Scale keep prob by layer number + assert self._cell_num != -1 + # The added 2 is for the reduction cells + num_cells = self._total_num_cells + layer_ratio = (self._cell_num + 1) / float(num_cells) + if use_summaries: + with tf.device('/cpu:0'): + tf.summary.scalar('layer_ratio', layer_ratio) + drop_path_keep_prob = 1 - layer_ratio * (1 - drop_path_keep_prob) + if drop_connect_version in ['v1', 'v3']: + # Decrease the keep probability over time + if not current_step: + current_step = tf.cast(tf.train.get_or_create_global_step(), + tf.float32) + drop_path_burn_in_steps = self._total_training_steps + current_ratio = current_step / drop_path_burn_in_steps + current_ratio = tf.minimum(1.0, current_ratio) + if use_summaries: + with tf.device('/cpu:0'): + tf.summary.scalar('current_ratio', current_ratio) + drop_path_keep_prob = (1 - current_ratio * (1 - drop_path_keep_prob)) + if use_summaries: + with tf.device('/cpu:0'): + tf.summary.scalar('drop_path_keep_prob', drop_path_keep_prob) + net = drop_path(net, drop_path_keep_prob) + return net + + +class NasNetANormalCell(NasNetABaseCell): + """NASNetA Normal Cell.""" + + def __init__(self, num_conv_filters, drop_path_keep_prob, total_num_cells, + total_training_steps): + operations = [ + 'separable_5x5_2', 'separable_3x3_2', 'separable_5x5_2', + 'separable_3x3_2', 'avg_pool_3x3', 'none', 'avg_pool_3x3', + 'avg_pool_3x3', 'separable_3x3_2', 'none' + ] + used_hiddenstates = [1, 0, 0, 0, 0, 0, 0] + hiddenstate_indices = [0, 1, 1, 1, 0, 1, 1, 1, 0, 0] + super(NasNetANormalCell, self).__init__( + num_conv_filters, operations, used_hiddenstates, hiddenstate_indices, + drop_path_keep_prob, total_num_cells, total_training_steps) + + +class NasNetAReductionCell(NasNetABaseCell): + """NASNetA Reduction Cell.""" + + def __init__(self, num_conv_filters, drop_path_keep_prob, total_num_cells, + total_training_steps): + operations = [ + 'separable_5x5_2', 'separable_7x7_2', 'max_pool_3x3', 'separable_7x7_2', + 'avg_pool_3x3', 'separable_5x5_2', 'none', 'avg_pool_3x3', + 'separable_3x3_2', 'max_pool_3x3' + ] + used_hiddenstates = [1, 1, 1, 0, 0, 0, 0] + hiddenstate_indices = [0, 1, 0, 1, 0, 1, 3, 2, 2, 0] + super(NasNetAReductionCell, self).__init__( + num_conv_filters, operations, used_hiddenstates, hiddenstate_indices, + drop_path_keep_prob, total_num_cells, total_training_steps) diff --git a/cv/classification/resnet50/tensorflow/models/tf1_only/ssd_model.py b/cv/classification/resnet50/tensorflow/models/tf1_only/ssd_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3d959d5be5ccf2d0197196ef46e113665f06b258 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/tf1_only/ssd_model.py @@ -0,0 +1,683 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +"""SSD300 Model Configuration. + +References: + Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + Cheng-Yang Fu, Alexander C. Berg + SSD: Single Shot MultiBox Detector + arXiv:1512.02325 + +Ported from MLPerf reference implementation: + https://github.com/mlperf/reference/tree/ssd/single_stage_detector/ssd + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import multiprocessing +import os +import re +import threading +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import constants +import mlperf +import ssd_constants +from cnn_util import log_fn +from models import model as model_lib +from models import resnet_model +from tensorflow.contrib import layers as contrib_layers +from tensorflow.python.ops import variables + +BACKBONE_MODEL_SCOPE_NAME = 'resnet34_backbone' + + +class SSD300Model(model_lib.CNNModel): + """Single Shot Multibox Detection (SSD) model for 300x300 image datasets.""" + + def __init__(self, label_num=ssd_constants.NUM_CLASSES, batch_size=32, + learning_rate=1e-3, backbone='resnet34', params=None): + super(SSD300Model, self).__init__('ssd300', 300, batch_size, learning_rate, + params=params) + # For COCO dataset, 80 categories + 1 background = 81 labels + self.label_num = label_num + + # Currently only support ResNet-34 as backbone model + if backbone != 'resnet34': + raise ValueError('Invalid backbone model %s for SSD.' % backbone) + mlperf.logger.log(key=mlperf.tags.BACKBONE, value=backbone) + + # Number of channels and default boxes associated with the following layers: + # ResNet34 layer, Conv7, Conv8_2, Conv9_2, Conv10_2, Conv11_2 + self.out_chan = [256, 512, 512, 256, 256, 256] + mlperf.logger.log(key=mlperf.tags.LOC_CONF_OUT_CHANNELS, + value=self.out_chan) + + # Number of default boxes from layers of different scales + # 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 + self.num_dboxes = [4, 6, 6, 6, 4, 4] + mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS_PER_CELL, + value=self.num_dboxes) + + # TODO(haoyuzhang): in order to correctly restore in replicated mode, need + # to create a saver for each tower before graph is finalized. Use variable + # manager for better efficiency. + self.backbone_savers = [] + + # Collected predictions for eval stage. It maps each image id in eval + # dataset to a dict containing the following information: + # source_id: raw ID of image + # raw_shape: raw shape of image + # pred_box: encoded box coordinates of prediction + # pred_scores: scores of classes in prediction + self.predictions = {} + + # Global step when predictions are collected. + self.eval_global_step = 0 + + # Average precision. In asynchronous eval mode, this is the latest AP we + # get so far and may not be the results at current eval step. + self.eval_coco_ap = 0 + + # Process, queues, and thread for asynchronous evaluation. When enabled, + # create a separate process (async_eval_process) that continuously pull + # intermediate results from the predictions queue (a multiprocessing queue), + # process them, and push final results into results queue (another + # multiprocessing queue). The main thread is responsible to push message + # into predictions queue, and start a separate thread to continuously pull + # messages from results queue to update final results. 
+ # Message in predictions queue should be a tuple of two elements: + # (evaluation step, predictions) + # Message in results queue should be a tuple of two elements: + # (evaluation step, final results) + self.async_eval_process = None + self.async_eval_predictions_queue = None + self.async_eval_results_queue = None + self.async_eval_results_getter_thread = None + + # The MLPerf reference uses a starting lr of 1e-3 at bs=32. + self.base_lr_batch_size = 32 + + def skip_final_affine_layer(self): + return True + + def gpu_preprocess_nhwc(self, images, phase_train=True): + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation ; To evaluate using COCO' + 'metric, download and install Python COCO API from' + 'https://github.com/cocodataset/cocoapi') + + if phase_train: + images = ssd_dataloader.color_jitter( + images, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05) + images = ssd_dataloader.normalize_image(images) + return images + + def add_backbone_model(self, cnn): + # -------------------------------------------------------------------------- + # Resnet-34 backbone model -- modified for SSD + # -------------------------------------------------------------------------- + + # Input 300x300, output 150x150 + cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True) + cnn.mpool(3, 3, 2, 2, mode='SAME') + + resnet34_layers = [3, 4, 6, 3] + version = 'v1' + + # ResNet-34 block group 1 + # Input 150x150, output 75x75 + for i in range(resnet34_layers[0]): + # Last argument forces residual_block to use projection shortcut, even + # though the numbers of input and output channels are equal + resnet_model.residual_block(cnn, 64, 1, version) + + # ResNet-34 block group 2 + # Input 75x75, output 38x38 + for i in range(resnet34_layers[1]): + stride = 2 if i == 0 else 1 + resnet_model.residual_block(cnn, 128, stride, version, i == 0) + + # ResNet-34 block group 3 + # This block group is modified: first layer uses stride=1 so that the image + # size does not change in group of layers + # Input 38x38, output 38x38 + for i in range(resnet34_layers[2]): + # The following line is intentionally commented out to differentiate from + # the original ResNet-34 model + # stride = 2 if i == 0 else 1 + resnet_model.residual_block(cnn, 256, stride, version, i == 0) + + # ResNet-34 block group 4: removed final block group + # The following 3 lines are intentionally commented out to differentiate + # from the original ResNet-34 model + # for i in range(resnet34_layers[3]): + # stride = 2 if i == 0 else 1 + # resnet_model.residual_block(cnn, 512, stride, version, i == 0) + + def add_inference(self, cnn): + cnn.use_batch_norm = True + cnn.batch_norm_config = {'decay': ssd_constants.BATCH_NORM_DECAY, + 'epsilon': ssd_constants.BATCH_NORM_EPSILON, + 'scale': True} + + with tf.variable_scope(BACKBONE_MODEL_SCOPE_NAME): + self.add_backbone_model(cnn) + + # -------------------------------------------------------------------------- + # SSD additional layers + # -------------------------------------------------------------------------- + + def add_ssd_layer(cnn, depth, k_size, stride, mode): + return cnn.conv( + 
depth, + k_size, + k_size, + stride, + stride, + mode=mode, + use_batch_norm=False, + kernel_initializer=contrib_layers.xavier_initializer()) + + # Activations for feature maps of different layers + self.activations = [cnn.top_layer] + # Conv7_1, Conv7_2 + # Input 38x38, output 19x19 + add_ssd_layer(cnn, 256, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same')) + + # Conv8_1, Conv8_2 + # Input 19x19, output 10x10 + add_ssd_layer(cnn, 256, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same')) + + # Conv9_1, Conv9_2 + # Input 10x10, output 5x5 + add_ssd_layer(cnn, 128, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 256, 3, 2, 'same')) + + # Conv10_1, Conv10_2 + # Input 5x5, output 3x3 + add_ssd_layer(cnn, 128, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid')) + + # Conv11_1, Conv11_2 + # Input 3x3, output 1x1 + add_ssd_layer(cnn, 128, 1, 1, 'valid') + self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid')) + + self.loc = [] + self.conf = [] + + for nd, ac, oc in zip(self.num_dboxes, self.activations, self.out_chan): + l = cnn.conv( + nd * 4, + 3, + 3, + 1, + 1, + input_layer=ac, + num_channels_in=oc, + activation=None, + use_batch_norm=False, + kernel_initializer=contrib_layers.xavier_initializer()) + scale = l.get_shape()[-1] + # shape = [batch_size, nd * 4, scale, scale] + l = tf.reshape(l, [self.batch_size, nd, 4, scale, scale]) + # shape = [batch_size, nd, 4, scale, scale] + l = tf.transpose(l, [0, 1, 3, 4, 2]) + # shape = [batch_size, nd, scale, scale, 4] + self.loc.append(tf.reshape(l, [self.batch_size, -1, 4])) + # shape = [batch_size, nd * scale * scale, 4] + + c = cnn.conv( + nd * self.label_num, + 3, + 3, + 1, + 1, + input_layer=ac, + num_channels_in=oc, + activation=None, + use_batch_norm=False, + kernel_initializer=contrib_layers.xavier_initializer()) + # shape = [batch_size, nd * label_num, scale, scale] + c = tf.reshape(c, [self.batch_size, nd, self.label_num, scale, scale]) + # shape = [batch_size, nd, label_num, scale, scale] + c = tf.transpose(c, [0, 1, 3, 4, 2]) + # shape = [batch_size, nd, scale, scale, label_num] + self.conf.append(tf.reshape(c, [self.batch_size, -1, self.label_num])) + # shape = [batch_size, nd * scale * scale, label_num] + + # Shape of locs: [batch_size, NUM_SSD_BOXES, 4] + # Shape of confs: [batch_size, NUM_SSD_BOXES, label_num] + locs, confs = tf.concat(self.loc, 1), tf.concat(self.conf, 1) + + # Pack location and confidence outputs into a single output layer + # Shape of logits: [batch_size, NUM_SSD_BOXES, 4+label_num] + logits = tf.concat([locs, confs], 2) + + cnn.top_layer = logits + cnn.top_size = 4 + self.label_num + + return cnn.top_layer + + def get_learning_rate(self, global_step, batch_size): + rescaled_lr = self.get_scaled_base_learning_rate(batch_size) + # Defined in MLPerf reference model + boundaries = [160000, 200000] + boundaries = [b * self.base_lr_batch_size // batch_size for b in boundaries] + decays = [1, 0.1, 0.01] + learning_rates = [rescaled_lr * d for d in decays] + lr = tf.train.piecewise_constant(global_step, boundaries, learning_rates) + warmup_steps = int(118287 / batch_size * 5) + warmup_lr = ( + rescaled_lr * tf.cast(global_step, tf.float32) / tf.cast( + warmup_steps, tf.float32)) + return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr) + + def get_scaled_base_learning_rate(self, batch_size): + """Calculates base learning rate for creating lr schedule. 
+ + In replicated mode, gradients are summed rather than averaged which, with + the sgd and momentum optimizers, increases the effective learning rate by + lr * num_gpus. Dividing the base lr by num_gpus negates the increase. + + Args: + batch_size: Total batch-size. + + Returns: + Base learning rate to use to create lr schedule. + """ + base_lr = self.learning_rate + if self.params.variable_update == 'replicated': + base_lr = self.learning_rate / self.params.num_gpus + scaled_lr = base_lr * (batch_size / self.base_lr_batch_size) + return scaled_lr + + def _collect_backbone_vars(self): + backbone_vars = tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, scope='.*'+ BACKBONE_MODEL_SCOPE_NAME) + var_list = {} + + # Assume variables in the checkpoint are following the naming convention of + # a model checkpoint trained with TF official model + # TODO(haoyuzhang): the following variable name parsing is hacky and easy + # to break if there is change in naming convention of either benchmarks or + # official models. + for v in backbone_vars: + # conv2d variable example (model <-- checkpoint): + # v/cg/conv24/conv2d/kernel:0 <-- conv2d_24/kernel + if 'conv2d' in v.name: + re_match = re.search(r'conv(\d+)/conv2d/(.+):', v.name) + if re_match: + layer_id = int(re_match.group(1)) + param_name = re_match.group(2) + vname_in_ckpt = self._var_name_in_official_model_ckpt( + 'conv2d', layer_id, param_name) + var_list[vname_in_ckpt] = v + + # batchnorm varariable example: + # v/cg/conv24/batchnorm25/gamma:0 <-- batch_normalization_25/gamma + elif 'batchnorm' in v.name: + re_match = re.search(r'batchnorm(\d+)/(.+):', v.name) + if re_match: + layer_id = int(re_match.group(1)) + param_name = re_match.group(2) + vname_in_ckpt = self._var_name_in_official_model_ckpt( + 'batch_normalization', layer_id, param_name) + var_list[vname_in_ckpt] = v + + return var_list + + def _var_name_in_official_model_ckpt(self, layer_name, layer_id, param_name): + """Return variable names according to convention in TF official models.""" + vname_in_ckpt = layer_name + if layer_id > 0: + vname_in_ckpt += '_' + str(layer_id) + vname_in_ckpt += '/' + param_name + return vname_in_ckpt + + def loss_function(self, inputs, build_network_result): + logits = build_network_result.logits + + # Unpack model output back to locations and confidence scores of predictions + # Shape of pred_loc: [batch_size, NUM_SSD_BOXES, 4] + # Shape of pred_label: [batch_size, NUM_SSD_BOXES, label_num] + pred_loc, pred_label = tf.split(logits, [4, self.label_num], 2) + + # Shape of gt_loc: [batch_size, NUM_SSD_BOXES, 4] + # Shape of gt_label: [batch_size, NUM_SSD_BOXES, 1] + # Shape of num_gt: [batch_size] + _, gt_loc, gt_label, num_gt = inputs + gt_label = tf.cast(gt_label, tf.int32) + + box_loss = self._localization_loss(pred_loc, gt_loc, gt_label, num_gt) + class_loss = self._classification_loss(pred_label, gt_label, num_gt) + + tf.summary.scalar('box_loss', tf.reduce_mean(box_loss)) + tf.summary.scalar('class_loss', tf.reduce_mean(class_loss)) + return class_loss + box_loss + + def _localization_loss(self, pred_loc, gt_loc, gt_label, num_matched_boxes): + """Computes the localization loss. + + Computes the localization loss using smooth l1 loss. + Args: + pred_loc: a flatten tensor that includes all predicted locations. The + shape is [batch_size, num_anchors, 4]. + gt_loc: a tensor representing box regression targets in + [batch_size, num_anchors, 4]. + gt_label: a tensor that represents the classification groundtruth targets. 
+ The shape is [batch_size, num_anchors, 1]. + num_matched_boxes: the number of anchors that are matched to a groundtruth + targets, used as the loss normalizater. The shape is [batch_size]. + Returns: + box_loss: a float32 representing total box regression loss. + """ + mask = tf.greater(tf.squeeze(gt_label), 0) + float_mask = tf.cast(mask, tf.float32) + + smooth_l1 = tf.reduce_sum(tf.losses.huber_loss( + gt_loc, pred_loc, + reduction=tf.losses.Reduction.NONE + ), axis=2) + smooth_l1 = tf.multiply(smooth_l1, float_mask) + box_loss = tf.reduce_sum(smooth_l1, axis=1) + + return tf.reduce_mean(box_loss / num_matched_boxes) + + def _classification_loss(self, pred_label, gt_label, num_matched_boxes): + """Computes the classification loss. + + Computes the classification loss with hard negative mining. + Args: + pred_label: a flatten tensor that includes all predicted class. The shape + is [batch_size, num_anchors, num_classes]. + gt_label: a tensor that represents the classification groundtruth targets. + The shape is [batch_size, num_anchors, 1]. + num_matched_boxes: the number of anchors that are matched to a groundtruth + targets. This is used as the loss normalizater. + + Returns: + box_loss: a float32 representing total box regression loss. + """ + cross_entropy = tf.losses.sparse_softmax_cross_entropy( + gt_label, pred_label, reduction=tf.losses.Reduction.NONE) + + mask = tf.greater(tf.squeeze(gt_label), 0) + float_mask = tf.cast(mask, tf.float32) + + # Hard example mining + neg_masked_cross_entropy = cross_entropy * (1 - float_mask) + relative_position = tf.argsort( + tf.argsort( + neg_masked_cross_entropy, direction='DESCENDING')) + num_neg_boxes = tf.minimum( + tf.to_int32(num_matched_boxes) * ssd_constants.NEGS_PER_POSITIVE, + ssd_constants.NUM_SSD_BOXES) + top_k_neg_mask = tf.cast(tf.less( + relative_position, + tf.tile(num_neg_boxes[:, tf.newaxis], (1, ssd_constants.NUM_SSD_BOXES)) + ), tf.float32) + + class_loss = tf.reduce_sum( + tf.multiply(cross_entropy, float_mask + top_k_neg_mask), axis=1) + + return tf.reduce_mean(class_loss / num_matched_boxes) + + def add_backbone_saver(self): + # Create saver with mapping from variable names in checkpoint of backbone + # model to variables in SSD model + backbone_var_list = self._collect_backbone_vars() + self.backbone_savers.append(tf.train.Saver(backbone_var_list)) + + def load_backbone_model(self, sess, backbone_model_path): + for saver in self.backbone_savers: + saver.restore(sess, backbone_model_path) + + def get_input_data_types(self, subset): + if subset == 'validation': + return [self.data_type, tf.float32, tf.float32, tf.float32, tf.int32] + return [self.data_type, tf.float32, tf.float32, tf.float32] + + def get_input_shapes(self, subset): + """Return encoded tensor shapes for train and eval data respectively.""" + if subset == 'validation': + # Validation data shapes: + # 1. images + # 2. ground truth locations of boxes + # 3. ground truth classes of objects in boxes + # 4. source image IDs + # 5. raw image shapes + return [ + [self.batch_size, self.image_size, self.image_size, self.depth], + [self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 4], + [self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 1], + [self.batch_size], + [self.batch_size, 3], + ] + + # Training data shapes: + # 1. images + # 2. ground truth locations of boxes + # 3. ground truth classes of objects in boxes + # 4. 
numbers of objects in images + return [ + [self.batch_size, self.image_size, self.image_size, self.depth], + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 4], + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 1], + [self.batch_size] + ] + + def accuracy_function(self, inputs, logits): + """Returns the ops to measure the mean precision of the model.""" + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + from object_detection.box_coders import faster_rcnn_box_coder # pylint: disable=g-import-not-at-top + from object_detection.core import box_coder # pylint: disable=g-import-not-at-top + from object_detection.core import box_list # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation ; To evaluate using COCO' + 'metric, download and install Python COCO API from' + 'https://github.com/cocodataset/cocoapi') + + # Unpack model output back to locations and confidence scores of predictions + # pred_locs: relative locations (coordinates) of objects in all SSD boxes + # shape: [batch_size, NUM_SSD_BOXES, 4] + # pred_labels: confidence scores of objects being of all categories + # shape: [batch_size, NUM_SSD_BOXES, label_num] + pred_locs, pred_labels = tf.split(logits, [4, self.label_num], 2) + + ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=ssd_constants.BOX_CODER_SCALES) + anchors = box_list.BoxList( + tf.convert_to_tensor(ssd_dataloader.DefaultBoxes()('ltrb'))) + pred_boxes = box_coder.batch_decode( + encoded_boxes=pred_locs, box_coder=ssd_box_coder, anchors=anchors) + + pred_scores = tf.nn.softmax(pred_labels, axis=2) + + # TODO(haoyuzhang): maybe use `gt_boxes` and `gt_classes` for visualization. + _, gt_boxes, gt_classes, source_id, raw_shape = inputs # pylint: disable=unused-variable + + return { + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.PRED_BOXES): pred_boxes, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.PRED_SCORES): pred_scores, + # TODO(haoyuzhang): maybe use these values for visualization. + # constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_boxes': gt_boxes, + # constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_classes': gt_classes, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.SOURCE_ID): source_id, + (constants.UNREDUCED_ACCURACY_OP_PREFIX + + ssd_constants.RAW_SHAPE): raw_shape + } + + def postprocess(self, results): + """Postprocess results returned from model.""" + try: + import coco_metric # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation ; To evaluate using COCO' + 'metric, download and install Python COCO API from' + 'https://github.com/cocodataset/cocoapi') + + pred_boxes = results[ssd_constants.PRED_BOXES] + pred_scores = results[ssd_constants.PRED_SCORES] + # TODO(haoyuzhang): maybe use these values for visualization. 
+ # gt_boxes = results['gt_boxes'] + # gt_classes = results['gt_classes'] + source_id = results[ssd_constants.SOURCE_ID] + raw_shape = results[ssd_constants.RAW_SHAPE] + + # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due + # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting + # `num_eval_epochs` to 1 is not enough and will often miss some images. We + # expect user to set `num_eval_epochs` to >1, which will leave some unused + # images from previous steps in `predictions`. Here we check if we are doing + # eval at a new global step. + if results['global_step'] > self.eval_global_step: + self.eval_global_step = results['global_step'] + self.predictions.clear() + + for i, sid in enumerate(source_id): + self.predictions[int(sid)] = { + ssd_constants.PRED_BOXES: pred_boxes[i], + ssd_constants.PRED_SCORES: pred_scores[i], + ssd_constants.SOURCE_ID: source_id[i], + ssd_constants.RAW_SHAPE: raw_shape[i] + } + + # COCO metric calculates mAP only after a full epoch of evaluation. Return + # dummy results for top_N_accuracy to be compatible with benchmar_cnn.py. + if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES: + log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format( + ssd_constants.COCO_NUM_VAL_IMAGES)) + + annotation_file = os.path.join(self.params.data_dir, + ssd_constants.ANNOTATION_FILE) + # Size of predictions before decoding about 15--30GB, while size after + # decoding is 100--200MB. When using async eval mode, decoding takes + # 20--30 seconds of main thread time but is necessary to avoid OOM during + # inter-process communication. + decoded_preds = coco_metric.decode_predictions(self.predictions.values()) + self.predictions.clear() + + if self.params.collect_eval_results_async: + def _eval_results_getter(): + """Iteratively get eval results from async eval process.""" + while True: + step, eval_results = self.async_eval_results_queue.get() + self.eval_coco_ap = eval_results['COCO/AP'] + mlperf.logger.log_eval_accuracy( + self.eval_coco_ap, step, self.batch_size * self.params.num_gpus, + ssd_constants.COCO_NUM_TRAIN_IMAGES) + if self.reached_target(): + # Reached target, clear all pending messages in predictions queue + # and insert poison pill to stop the async eval process. + while not self.async_eval_predictions_queue.empty(): + self.async_eval_predictions_queue.get() + self.async_eval_predictions_queue.put('STOP') + break + + if not self.async_eval_process: + # Limiting the number of messages in predictions queue to prevent OOM. + # Each message (predictions data) can potentially consume a lot of + # memory, and normally there should only be few messages in the queue. + # If often blocked on this, consider reducing eval frequency. + self.async_eval_predictions_queue = multiprocessing.Queue(2) + self.async_eval_results_queue = multiprocessing.Queue() + + # Reason to use a Process as opposed to Thread is mainly the + # computationally intensive eval runner. Python multithreading is not + # truly running in parallel, a runner thread would get significantly + # delayed (or alternatively delay the main thread). 
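+          # The process and the results-getter thread are marked as daemons
+          # below, so they do not block interpreter shutdown if training
+          # exits first.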
+ self.async_eval_process = multiprocessing.Process( + target=coco_metric.async_eval_runner, + args=(self.async_eval_predictions_queue, + self.async_eval_results_queue, + annotation_file)) + self.async_eval_process.daemon = True + self.async_eval_process.start() + + self.async_eval_results_getter_thread = threading.Thread( + target=_eval_results_getter, args=()) + self.async_eval_results_getter_thread.daemon = True + self.async_eval_results_getter_thread.start() + + self.async_eval_predictions_queue.put( + (self.eval_global_step, decoded_preds)) + return {'top_1_accuracy': 0, 'top_5_accuracy': 0.} + + eval_results = coco_metric.compute_map(decoded_preds, annotation_file) + self.eval_coco_ap = eval_results['COCO/AP'] + ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.} + for metric_key, metric_value in eval_results.items(): + ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value + mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step, + self.batch_size * self.params.num_gpus, + ssd_constants.COCO_NUM_TRAIN_IMAGES) + return ret + log_fn('Got {:d} out of {:d} eval examples.' + ' Waiting for the remaining to calculate mAP...'.format( + len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES)) + return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.} + + def get_synthetic_inputs(self, input_name, nclass): + """Generating synthetic data matching real data shape and type.""" + inputs = tf.random_uniform( + self.get_input_shapes('train')[0], dtype=self.data_type) + inputs = variables.VariableV1(inputs, trainable=False, + collections=[tf.GraphKeys.LOCAL_VARIABLES], + name=input_name) + boxes = tf.random_uniform( + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 4], dtype=tf.float32) + classes = tf.random_uniform( + [self.batch_size, ssd_constants.NUM_SSD_BOXES, 1], dtype=tf.float32) + nboxes = tf.random_uniform( + [self.batch_size], minval=1, maxval=10, dtype=tf.float32) + return (inputs, boxes, classes, nboxes) + + def reached_target(self): + return (self.params.stop_at_top_1_accuracy and + self.eval_coco_ap >= self.params.stop_at_top_1_accuracy) diff --git a/cv/classification/resnet50/tensorflow/models/trivial_model.py b/cv/classification/resnet50/tensorflow/models/trivial_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba84d72672c6e3c0903c9af2d0dddecdd7fa2c1 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/trivial_model.py @@ -0,0 +1,73 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Trivial model configuration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from models import model + + +class TrivialModel(model.CNNModel): + """Trivial model configuration.""" + + def __init__(self, params=None): + super(TrivialModel, self).__init__( + 'trivial', 224 + 3, 32, 0.005, params=params) + + def add_inference(self, cnn): + cnn.reshape([-1, 227 * 227 * 3]) + cnn.affine(1) + cnn.affine(4096) + + +class TrivialCifar10Model(model.CNNModel): + """Trivial cifar10 model configuration.""" + + def __init__(self, params=None): + super(TrivialCifar10Model, self).__init__( + 'trivial', 32, 32, 0.005, params=params) + + def add_inference(self, cnn): + cnn.reshape([-1, 32 * 32 * 3]) + cnn.affine(1) + cnn.affine(4096) + + +class TrivialSSD300Model(model.CNNModel): + """Trivial SSD300 model configuration.""" + + def __init__(self, params=None): + super(TrivialSSD300Model, self).__init__( + 'trivial', 300, params.batch_size, 0.005, params=params) + + def add_inference(self, cnn): + cnn.reshape([-1, 300 * 300 * 3]) + cnn.affine(1) + cnn.affine(4096) + + def get_input_shapes(self, subset): + return [[self.batch_size, 300, 300, 3], + [self.batch_size, 8732, 4], + [self.batch_size, 8732, 1], + [self.batch_size]] + + def loss_function(self, inputs, build_network_result): + images, _, _, labels = inputs + labels = tf.cast(labels, tf.int32) + return super(TrivialSSD300Model, self).loss_function( + (images, labels), build_network_result) diff --git a/cv/classification/resnet50/tensorflow/models/vgg_model.py b/cv/classification/resnet50/tensorflow/models/vgg_model.py new file mode 100644 index 0000000000000000000000000000000000000000..938385c95bbc916ca8677bca232085334a48bbf4 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/models/vgg_model.py @@ -0,0 +1,83 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Vgg model configuration. + +Includes multiple models: vgg11, vgg16, vgg19, corresponding to + model A, D, and E in Table 1 of [1]. 
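+
+The variants differ only in the number of 3x3 conv layers per block;
+`_construct_vgg` below receives these counts as a five-element list
+(e.g. [2, 2, 3, 3, 3] for vgg16).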
+ +References: +[1] Simonyan, Karen, Andrew Zisserman + Very Deep Convolutional Networks for Large-Scale Image Recognition + arXiv:1409.1556 (2014) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import xrange # pylint: disable=redefined-builtin +from models import model + + +def _construct_vgg(cnn, num_conv_layers): + """Build vgg architecture from blocks.""" + assert len(num_conv_layers) == 5 + for _ in xrange(num_conv_layers[0]): + cnn.conv(64, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[1]): + cnn.conv(128, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[2]): + cnn.conv(256, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[3]): + cnn.conv(512, 3, 3) + cnn.mpool(2, 2) + for _ in xrange(num_conv_layers[4]): + cnn.conv(512, 3, 3) + cnn.mpool(2, 2) + cnn.reshape([-1, 512 * 7 * 7]) + cnn.affine(4096) + cnn.dropout() + cnn.affine(4096) + cnn.dropout() + + +class Vgg11Model(model.CNNModel): + + def __init__(self, params=None): + super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005, params=params) + + def add_inference(self, cnn): + _construct_vgg(cnn, [1, 1, 2, 2, 2]) + + +class Vgg16Model(model.CNNModel): + + def __init__(self, params=None): + super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005, params=params) + + def add_inference(self, cnn): + _construct_vgg(cnn, [2, 2, 3, 3, 3]) + + +class Vgg19Model(model.CNNModel): + + def __init__(self, params=None): + super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005, params=params) + + def add_inference(self, cnn): + _construct_vgg(cnn, [2, 2, 4, 4, 4]) diff --git a/cv/classification/resnet50/tensorflow/platforms/__init__.py b/cv/classification/resnet50/tensorflow/platforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/platforms/default/__init__.py b/cv/classification/resnet50/tensorflow/platforms/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/platforms/default/util.py b/cv/classification/resnet50/tensorflow/platforms/default/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e64b9137fa6ccc5d12b07126dcf30265574eae41 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/platforms/default/util.py @@ -0,0 +1,90 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Utility code for the default platform.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import tempfile + +import cnn_util +from models import model_config + + +_ROOT_PROJECT_DIR = os.path.dirname(cnn_util.__file__) + + +def define_platform_params(): + """Defines platform-specific parameters. + + Currently there are no platform-specific parameters to be defined. + """ + pass + + +def get_cluster_manager(params, config_proto): + """Returns the cluster manager to be used.""" + return cnn_util.GrpcClusterManager(params, config_proto) + + +def get_command_to_run_python_module(module): + """Returns a command to run a Python module.""" + python_interpretter = sys.executable + if not python_interpretter: + raise ValueError('Could not find Python interpreter') + return [python_interpretter, + os.path.join(_ROOT_PROJECT_DIR, module + '.py')] + + +def get_test_output_dir(): + """Returns a directory where test outputs should be placed.""" + base_dir = os.environ.get('TEST_OUTPUTS_DIR', + '/tmp/tf_cnn_benchmarks_test_outputs') + if not os.path.exists(base_dir): + os.mkdir(base_dir) + return tempfile.mkdtemp(dir=base_dir) + + +def get_test_data_dir(): + """Returns the path to the test_data directory.""" + return os.path.join(_ROOT_PROJECT_DIR, 'test_data') + + +def get_ssd_backborn_model_file(): + raise NotImplementedError + + +def get_ssd_backboard_data_dir(): + raise NotImplementedError + + +def _initialize(params, config_proto): + del params, config_proto + model_config.register_tf1_models() + + +_is_initalized = False + + +def initialize(params, config_proto): + global _is_initalized + if _is_initalized: + return + _is_initalized = True + _initialize(params, config_proto) diff --git a/cv/classification/resnet50/tensorflow/platforms/util.py b/cv/classification/resnet50/tensorflow/platforms/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9d569691bdec804080d62d11f8a200cd1ec2f2a9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/platforms/util.py @@ -0,0 +1,30 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Utility code for a certain platform. + +This file simply imports everything from the default platform. To switch to a +different platform, the import statement can be changed to point to a new +platform. + +Creating a custom platform can be useful to, e.g., run some initialization code +required by the platform or register a platform-specific model. 
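+
+For example, for a hypothetical platform package `platforms.my_platform`, the
+wildcard import at the bottom of this file would become:
+
+    from platforms.my_platform.util import *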
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from platforms.default.util import * # pylint: disable=unused-import,wildcard-import diff --git a/cv/classification/resnet50/tensorflow/preprocessing.py b/cv/classification/resnet50/tensorflow/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..43cca8c2adc810150c726f07994c0042f3f4b7f4 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/preprocessing.py @@ -0,0 +1,1336 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Image pre-processing utilities. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +import cnn_util +try: + from tensorflow.python.data.experimental.ops import threadpool +except: + threadpool = None +from tensorflow.python.data.ops import multi_device_iterator_ops +from tensorflow.python.framework import function +from tensorflow.python.layers import utils +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.platform import gfile +import mlperf +import numpy as np + +tf.random.set_random_seed(42) +np.random.seed(42) + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields: + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + text: Tensor tf.string containing the human-readable label. + """ + # Dense features in Example proto. 
+ feature_map = { + 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.parse_single_example(example_serialized, feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. + bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(bbox, [0, 2, 1]) + + return features['image/encoded'], label, bbox, features['image/class/text'] + + +_RESIZE_METHOD_MAP = { + 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, + 'bilinear': tf.image.ResizeMethod.BILINEAR, + 'bicubic': tf.image.ResizeMethod.BICUBIC, + 'area': tf.image.ResizeMethod.AREA +} + + +def get_image_resize_method(resize_method, batch_position=0): + """Get tensorflow resize method. + + If resize_method is 'round_robin', return different methods based on batch + position in a round-robin fashion. NOTE: If the batch size is not a multiple + of the number of methods, then the distribution of methods will not be + uniform. + + Args: + resize_method: (string) nearest, bilinear, bicubic, area, or round_robin. + batch_position: position of the image in a batch. NOTE: this argument can + be an integer or a tensor + Returns: + one of resize type defined in tf.image.ResizeMethod. + """ + + if resize_method != 'round_robin': + return _RESIZE_METHOD_MAP[resize_method] + + # return a resize method based on batch position in a round-robin fashion. + resize_methods = list(_RESIZE_METHOD_MAP.values()) + def lookup(index): + return resize_methods[index] + + def resize_method_0(): + return utils.smart_cond(batch_position % len(resize_methods) == 0, + lambda: lookup(0), resize_method_1) + + def resize_method_1(): + return utils.smart_cond(batch_position % len(resize_methods) == 1, + lambda: lookup(1), resize_method_2) + + def resize_method_2(): + return utils.smart_cond(batch_position % len(resize_methods) == 2, + lambda: lookup(2), lambda: lookup(3)) + + # NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here + # because TF would not be able to construct a finite graph. + + return resize_method_0() + + +def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32): + """Decode a JPEG string into one 3-D float image Tensor. + + Args: + image_buffer: scalar string Tensor. + scope: Optional scope for op_scope. + Returns: + 3-D float Tensor with values ranging from [0, 1). + """ + # with tf.op_scope([image_buffer], scope, 'decode_jpeg'): + # with tf.name_scope(scope, 'decode_jpeg', [image_buffer]): + with tf.name_scope(scope or 'decode_jpeg'): + # Decode the string as an RGB JPEG. 
+ # Note that the resulting image contains an unknown height and width + # that is set dynamically by decode_jpeg. In other words, the height + # and width of image is unknown at compile-time. + image = tf.image.decode_jpeg(image_buffer, channels=3, + fancy_upscaling=False, + dct_method='INTEGER_FAST') + + # image = tf.Print(image, [tf.shape(image)], 'Image shape: ') + + return image + + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 +_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] + + +def normalized_image(images): + # Rescale from [0, 255] to [0, 2] + images = tf.multiply(images, 1. / 127.5) + # Rescale to [-1, 1] + mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, value=[1.0] * 3) + return tf.subtract(images, 1.0) + + +def eval_image(image, + height, + width, + batch_position, + resize_method, + summary_verbosity=0): + """Get the image for model evaluation. + + We preprocess the image simiarly to Slim, see + https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py + Validation images do not have bounding boxes, so to crop the image, we first + resize the image such that the aspect ratio is maintained and the resized + height and width are both at least 1.145 times `height` and `width` + respectively. Then, we do a central crop to size (`height`, `width`). + + Args: + image: 3-D float Tensor representing the image. + height: The height of the image that will be returned. + width: The width of the image that will be returned. + batch_position: position of the image in a batch, which affects how images + are distorted and resized. NOTE: this argument can be an integer or a + tensor + resize_method: one of the strings 'round_robin', 'nearest', 'bilinear', + 'bicubic', or 'area'. + summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both + summaries and checkpoints. + Returns: + An image of size (output_height, output_width, 3) that is resized and + cropped as described above. + """ + # TODO(reedwm): Currently we resize then crop. Investigate if it's faster to + # crop then resize. + with tf.name_scope('eval_image'): + if summary_verbosity >= 3: + tf.summary.image( + 'original_image', tf.expand_dims(image, 0)) + + shape = tf.shape(image) + image_height = shape[0] + image_width = shape[1] + image_height_float = tf.cast(image_height, tf.float32) + image_width_float = tf.cast(image_width, tf.float32) + + # This value is chosen so that in resnet, images are cropped to a size of + # 256 x 256, which matches what other implementations do. The final image + # size for resnet is 224 x 224, and floor(224 * 1.145) = 256. + scale_factor = 1.145 + + # Compute resize_height and resize_width to be the minimum values such that + # 1. The aspect ratio is maintained (i.e. resize_height / resize_width is + # image_height / image_width), and + # 2. resize_height >= height * `scale_factor`, and + # 3. 
resize_width >= width * `scale_factor` + max_ratio = tf.maximum(height / image_height_float, + width / image_width_float) + resize_height = tf.cast(image_height_float * max_ratio * scale_factor, + tf.int32) + resize_width = tf.cast(image_width_float * max_ratio * scale_factor, + tf.int32) + mlperf.logger.log_input_resize_aspect_preserving(height, width, + scale_factor) + + # Resize the image to shape (`resize_height`, `resize_width`) + image_resize_method = get_image_resize_method(resize_method, batch_position) + distorted_image = tf.image.resize_images(image, + [resize_height, resize_width], + image_resize_method, + align_corners=False) + + # Do a central crop of the image to size (height, width). + # MLPerf requires us to log (height, width) with two different keys. + mlperf.logger.log(key=mlperf.tags.INPUT_CENTRAL_CROP, value=[height, width]) + mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width]) + total_crop_height = (resize_height - height) + crop_top = total_crop_height // 2 + total_crop_width = (resize_width - width) + crop_left = total_crop_width // 2 + distorted_image = tf.slice(distorted_image, [crop_top, crop_left, 0], + [height, width, 3]) + + distorted_image.set_shape([height, width, 3]) + if summary_verbosity >= 3: + tf.summary.image( + 'cropped_resized_image', tf.expand_dims(distorted_image, 0)) + image = distorted_image + return image + + +def train_image(image_buffer, + height, + width, + bbox, + batch_position, + resize_method, + distortions, + scope=None, + summary_verbosity=0, + distort_color_in_yiq=False, + fuse_decode_and_crop=False): + """Distort one image for training a network. + + Distorting images provides a useful technique for augmenting the data + set during training in order to make the network invariant to aspects + of the image that do not effect the label. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + height: integer + width: integer + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged + as [ymin, xmin, ymax, xmax]. + batch_position: position of the image in a batch, which affects how images + are distorted and resized. NOTE: this argument can be an integer or a + tensor + resize_method: round_robin, nearest, bilinear, bicubic, or area. + distortions: If true, apply full distortions for image colors. + scope: Optional scope for op_scope. + summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both + summaries and checkpoints. + distort_color_in_yiq: distort color of input images in YIQ space. + fuse_decode_and_crop: fuse the decode/crop operation. + Returns: + 3-D float Tensor of distorted image used for training. + """ + # with tf.op_scope([image, height, width, bbox], scope, 'distort_image'): + # with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]): + with tf.name_scope(scope or 'distort_image'): + # A large fraction of image datasets contain a human-annotated bounding box + # delineating the region of the image containing the object of interest. We + # choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an + # allowed range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. 
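+    # The constraints below are passed to tf.image.sample_distorted_bounding_box
+    # and each value is also logged via the MLPerf logger.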
+ min_object_covered = 0.1 + aspect_ratio_range = [0.75, 1.33] + area_range = [0.05, 1.0] + max_attempts = 100 + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV, + value=min_object_covered) + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE, + value=aspect_ratio_range) + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE, + value=area_range) + mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS, + value=max_attempts) + + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.image.extract_jpeg_shape(image_buffer), + bounding_boxes=bbox, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + max_attempts=max_attempts, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box + if summary_verbosity >= 3: + image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + image_with_distorted_box = tf.image.draw_bounding_boxes( + tf.expand_dims(image, 0), distort_bbox) + tf.summary.image( + 'images_with_distorted_bounding_box', + image_with_distorted_box) + + # Crop the image to the specified bounding box. + if fuse_decode_and_crop: + offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + image = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=3) + else: + image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') + image = tf.slice(image, bbox_begin, bbox_size) + + mlperf.logger.log(key=mlperf.tags.INPUT_RANDOM_FLIP) + distorted_image = tf.image.random_flip_left_right(image) + + # This resizing operation may distort the images because the aspect + # ratio is not respected. + mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width]) + image_resize_method = get_image_resize_method(resize_method, batch_position) + distorted_image = tf.image.resize_images( + distorted_image, [height, width], + image_resize_method, + align_corners=False) + # Restore the shape since the dynamic slice based upon the bbox_size loses + # the third dimension. + distorted_image.set_shape([height, width, 3]) + if summary_verbosity >= 3: + tf.summary.image('cropped_resized_maybe_flipped_image', + tf.expand_dims(distorted_image, 0)) + + if distortions: + distorted_image = tf.cast(distorted_image, dtype=tf.float32) + # Images values are expected to be in [0,1] for color distortion. + distorted_image /= 255. + # Randomly distort the colors. + distorted_image = distort_color(distorted_image, batch_position, + distort_color_in_yiq=distort_color_in_yiq) + + # Note: This ensures the scaling matches the output of eval_image + distorted_image *= 255 + + if summary_verbosity >= 3: + tf.summary.image( + 'final_distorted_image', + tf.expand_dims(distorted_image, 0)) + return distorted_image + + +def distort_color(image, batch_position=0, distort_color_in_yiq=False, + scope=None): + """Distort the color of the image. + + Each color distortion is non-commutative and thus ordering of the color ops + matters. Ideally we would randomly permute the ordering of the color ops. + Rather then adding that level of complication, we select a distinct ordering + of color ops based on the position of the image in a batch. + + Args: + image: float32 Tensor containing single image. 
Tensor values should be in + range [0, 1]. + batch_position: the position of the image in a batch. NOTE: this argument + can be an integer or a tensor + distort_color_in_yiq: distort color of input images in YIQ space. + scope: Optional scope for op_scope. + Returns: + color-distorted image + """ + if distort_color_in_yiq: + try: + from tensorflow.contrib.image.python.ops import distort_image_ops # pylint: disable=g-import-not-at-top + except ImportError: + raise ValueError( + 'In TF2, you cannot pass --distortions unless you also pass ' + '--nodistort_color_in_yiq. This is because the random_hsv_in_yiq was ' + 'removed in TF2. --distortions does not improve accuracy on resnet ' + 'so it is not recommended. --nodistort_color_in_yiq also has no ' + 'impact on accuracy, but may hurt performance.') + + with tf.name_scope(scope or 'distort_color'): + + def distort_fn_0(image=image): + """Variant 0 of distort function.""" + image = tf.image.random_brightness(image, max_delta=32. / 255.) + if distort_color_in_yiq: + image = distort_image_ops.random_hsv_in_yiq( + image, lower_saturation=0.5, upper_saturation=1.5, + max_delta_hue=0.2 * math.pi) + else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + return image + + def distort_fn_1(image=image): + """Variant 1 of distort function.""" + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + if distort_color_in_yiq: + image = distort_image_ops.random_hsv_in_yiq( + image, lower_saturation=0.5, upper_saturation=1.5, + max_delta_hue=0.2 * math.pi) + else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + return image + + image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0, + distort_fn_1) + # The random_* ops do not necessarily clamp. + image = tf.clip_by_value(image, 0.0, 1.0) + return image + + +class InputPreprocessor(object): + """Base class for all model preprocessors.""" + + def __init__(self, batch_size, output_shapes): + self.batch_size = batch_size + self.output_shapes = output_shapes + + def supports_datasets(self): + """Whether this preprocessor supports dataset.""" + return False + + def minibatch(self, dataset, subset, params, shift_ratio=-1): + """Returns tensors representing a minibatch of all the input.""" + raise NotImplementedError('Must be implemented by subclass.') + + # The methods added below are only supported/used if supports_datasets() + # returns True. + # TODO(laigd): refactor benchmark_cnn.py and put the logic of + # _build_input_processing() into InputPreprocessor. + + def parse_and_preprocess(self, value, batch_position): + """Function to parse and preprocess an Example proto in input pipeline.""" + raise NotImplementedError('Must be implemented by subclass.') + + # TODO(laigd): figure out how to remove these parameters, since the + # preprocessor itself has self.batch_size, self.num_splits, etc defined. 
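To make the batch-position trick in `distort_color()` above concrete, the following self-contained sketch picks one of two distortion orderings from the position's parity and clips back to [0, 1]. It uses a plain `tf.cond` instead of `utils.smart_cond` and only the non-YIQ path, so it is an illustration rather than a drop-in replacement.

```python
import tensorflow.compat.v1 as tf

def toy_distort_color(image, batch_position):
  """image: float32 tensor in [0, 1]; batch_position: int or scalar int tensor."""
  def order_0():
    x = tf.image.random_brightness(image, max_delta=32. / 255.)
    x = tf.image.random_saturation(x, lower=0.5, upper=1.5)
    x = tf.image.random_hue(x, max_delta=0.2)
    return tf.image.random_contrast(x, lower=0.5, upper=1.5)

  def order_1():
    x = tf.image.random_brightness(image, max_delta=32. / 255.)
    x = tf.image.random_contrast(x, lower=0.5, upper=1.5)
    x = tf.image.random_saturation(x, lower=0.5, upper=1.5)
    return tf.image.random_hue(x, max_delta=0.2)

  distorted = tf.cond(tf.equal(batch_position % 2, 0), order_0, order_1)
  # The random_* ops do not necessarily clamp, so clip back to [0, 1].
  return tf.clip_by_value(distorted, 0.0, 1.0)
```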
+ def build_multi_device_iterator(self, batch_size, num_splits, cpu_device, + params, gpu_devices, dataset, doing_eval): + """Creates a MultiDeviceIterator.""" + assert self.supports_datasets() + assert num_splits == len(gpu_devices) + with tf.name_scope('batch_processing'): + if doing_eval: + subset = 'validation' + else: + subset = 'train' + batch_size_per_split = batch_size // num_splits + ds = self.create_dataset( + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train=(not doing_eval), + datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, + num_threads=params.datasets_num_private_threads, + datasets_use_caching=params.datasets_use_caching, + datasets_parallel_interleave_cycle_length=( + params.datasets_parallel_interleave_cycle_length), + datasets_sloppy_parallel_interleave=( + params.datasets_sloppy_parallel_interleave), + datasets_parallel_interleave_prefetch=( + params.datasets_parallel_interleave_prefetch)) + multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator( + ds, + gpu_devices, + source_device=cpu_device, + max_buffer_size=params.multi_device_iterator_max_buffer_size) + tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, + multi_device_iterator.initializer) + return multi_device_iterator + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + raise NotImplementedError('Must be implemented by subclass.') + + def create_iterator(self, ds): + ds_iterator = tf.data.make_initializable_iterator(ds) + tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, + ds_iterator.initializer) + return ds_iterator + + def minibatch_fn(self, batch_size, model_input_shapes, num_splits, + dataset, subset, train, datasets_repeat_cached_sample, + num_threads, datasets_use_caching, + datasets_parallel_interleave_cycle_length, + datasets_sloppy_parallel_interleave, + datasets_parallel_interleave_prefetch): + """Returns a function and list of args for the fn to create a minibatch.""" + assert self.supports_datasets() + batch_size_per_split = batch_size // num_splits + assert batch_size_per_split == model_input_shapes[0][0] + with tf.name_scope('batch_processing'): + ds = self.create_dataset(batch_size, num_splits, batch_size_per_split, + dataset, subset, train, + datasets_repeat_cached_sample, num_threads, + datasets_use_caching, + datasets_parallel_interleave_cycle_length, + datasets_sloppy_parallel_interleave, + datasets_parallel_interleave_prefetch) + ds_iterator = self.create_iterator(ds) + + ds_iterator_string_handle = ds_iterator.string_handle() + + @function.Defun(tf.string) + def _fn(h): + remote_iterator = tf.data.Iterator.from_string_handle( + h, ds_iterator.output_types, ds_iterator.output_shapes) + input_list = remote_iterator.get_next() + reshaped_input_list = [ + tf.reshape(input_list[i], shape=model_input_shapes[i]) + for i in range(len(input_list)) + ] + return reshaped_input_list + + return _fn, [ds_iterator_string_handle] + + +class BaseImagePreprocessor(InputPreprocessor): + """Base class for all image model preprocessors.""" + + def __init__(self, + batch_size, + output_shapes, + num_splits, + dtype, + train, + distortions, + resize_method, + shift_ratio=-1, + summary_verbosity=0, + 
distort_color_in_yiq=True, + fuse_decode_and_crop=True, + match_mlperf=False): + super(BaseImagePreprocessor, self).__init__(batch_size, output_shapes) + image_shape = output_shapes[0] + # image_shape is in form (batch_size, height, width, depth) + self.height = image_shape[1] + self.width = image_shape[2] + self.depth = image_shape[3] + self.num_splits = num_splits + self.dtype = dtype + self.train = train + self.resize_method = resize_method + self.shift_ratio = shift_ratio + self.distortions = distortions + self.distort_color_in_yiq = distort_color_in_yiq + self.fuse_decode_and_crop = fuse_decode_and_crop + if self.batch_size % self.num_splits != 0: + raise ValueError( + ('batch_size must be a multiple of num_splits: ' + 'batch_size %d, num_splits: %d') % + (self.batch_size, self.num_splits)) + self.batch_size_per_split = self.batch_size // self.num_splits + self.summary_verbosity = summary_verbosity + self.match_mlperf = match_mlperf + + def parse_and_preprocess(self, value, batch_position): + assert self.supports_datasets() + image_buffer, label_index, bbox, _ = parse_example_proto(value) + if self.match_mlperf: + bbox = tf.zeros((1, 0, 4), dtype=bbox.dtype) + mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=False) + else: + mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=True) + image = self.preprocess(image_buffer, bbox, batch_position) + return (image, label_index) + + def preprocess(self, image_buffer, bbox, batch_position): + raise NotImplementedError('Must be implemented by subclass.') + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + assert self.supports_datasets() + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train) + ds = ds.apply( + tf.data.experimental.parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=datasets_parallel_interleave_cycle_length or 10, + sloppy=datasets_sloppy_parallel_interleave, + prefetch_input_elements=datasets_parallel_interleave_prefetch)) + if datasets_repeat_cached_sample: + # Repeat a single sample element indefinitely to emulate memory-speed IO. 
+ ds = ds.take(1).cache().repeat() + counter = tf.data.Dataset.range(batch_size) + counter = counter.repeat() + ds = tf.data.Dataset.zip((ds, counter)) + ds = ds.prefetch(buffer_size=batch_size) + if datasets_use_caching: + ds = ds.cache() + if train: + buffer_size = 10000 + mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=buffer_size) + ds = ds.apply( + tf.data.experimental.shuffle_and_repeat(buffer_size=buffer_size)) + else: + ds = ds.repeat() + ds = ds.apply( + tf.data.experimental.map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=batch_size_per_split, + num_parallel_batches=num_splits)) + ds = ds.prefetch(buffer_size=num_splits) + if num_threads and threadpool is not None: + ds = threadpool.override_threadpool( + ds, + threadpool.PrivateThreadPool( + num_threads, display_name='input_pipeline_thread_pool')) + return ds + + +class RecordInputImagePreprocessor(BaseImagePreprocessor): + """Preprocessor for images with RecordInput format.""" + + def preprocess(self, image_buffer, bbox, batch_position): + """Preprocessing image_buffer as a function of its batch position.""" + if self.train: + image = train_image(image_buffer, self.height, self.width, bbox, + batch_position, self.resize_method, self.distortions, + None, summary_verbosity=self.summary_verbosity, + distort_color_in_yiq=self.distort_color_in_yiq, + fuse_decode_and_crop=self.fuse_decode_and_crop) + else: + image = tf.image.decode_jpeg( + image_buffer, channels=3, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, batch_position, + self.resize_method, + summary_verbosity=self.summary_verbosity) + # Note: image is now float32 [height,width,3] with range [0, 255] + + # image = tf.cast(image, tf.uint8) # HACK TESTING + + if self.match_mlperf: + mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, + value=_CHANNEL_MEANS) + normalized = image - _CHANNEL_MEANS + else: + normalized = normalized_image(image) + return tf.cast(normalized, self.dtype) + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=-1): + if shift_ratio < 0: + shift_ratio = self.shift_ratio + with tf.name_scope('batch_processing'): + # Build final results per split. + images = [[] for _ in range(self.num_splits)] + labels = [[] for _ in range(self.num_splits)] + if params.use_datasets: + ds = self.create_dataset( + self.batch_size, self.num_splits, self.batch_size_per_split, + dataset, subset, self.train, + datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, + num_threads=params.datasets_num_private_threads, + datasets_use_caching=params.datasets_use_caching, + datasets_parallel_interleave_cycle_length=( + params.datasets_parallel_interleave_cycle_length), + datasets_sloppy_parallel_interleave=( + params.datasets_sloppy_parallel_interleave), + datasets_parallel_interleave_prefetch=( + params.datasets_parallel_interleave_prefetch)) + ds_iterator = self.create_iterator(ds) + for d in xrange(self.num_splits): + images[d], labels[d] = ds_iterator.get_next() + + # TODO(laigd): consider removing the --use_datasets option, it should + # always use datasets. 
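As a compact reference for the dataset path above, this standalone sketch mirrors the interleave, zip-with-counter, shuffle/repeat, map-and-batch, and prefetch structure of `create_dataset()`. File names, the parse function, and the knob values are made up for illustration; the real pipeline additionally supports caching, cached-sample repetition, and a private threadpool.

```python
import tensorflow.compat.v1 as tf

def toy_parse(serialized, batch_position):
  # Stands in for parse_and_preprocess(); the zipped counter supplies batch_position.
  del batch_position
  features = tf.parse_single_example(
      serialized, {'label': tf.FixedLenFeature([], tf.int64, default_value=0)})
  return features['label']

batch_size, num_splits = 64, 2
file_names = ['train-00000-of-00002', 'train-00001-of-00002']  # hypothetical
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=True)
ds = ds.apply(tf.data.experimental.parallel_interleave(
    tf.data.TFRecordDataset, cycle_length=10, sloppy=True))
counter = tf.data.Dataset.range(batch_size).repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
ds = ds.apply(tf.data.experimental.map_and_batch(
    map_func=toy_parse,
    batch_size=batch_size // num_splits,
    num_parallel_batches=num_splits))
ds = ds.prefetch(buffer_size=num_splits)
```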
+ else: + record_input = data_flow_ops.RecordInput( + file_pattern=dataset.tf_record_pattern(subset), + seed=301, + parallelism=64, + buffer_size=10000, + batch_size=self.batch_size, + shift_ratio=shift_ratio, + name='record_input') + records = record_input.get_yield_op() + records = tf.split(records, self.batch_size, 0) + records = [tf.reshape(record, []) for record in records] + for idx in xrange(self.batch_size): + value = records[idx] + (image, label) = self.parse_and_preprocess(value, idx) + split_index = idx % self.num_splits + labels[split_index].append(label) + images[split_index].append(image) + + for split_index in xrange(self.num_splits): + if not params.use_datasets: + images[split_index] = tf.parallel_stack(images[split_index]) + labels[split_index] = tf.concat(labels[split_index], 0) + images[split_index] = tf.reshape( + images[split_index], + shape=[self.batch_size_per_split, self.height, self.width, + self.depth]) + labels[split_index] = tf.reshape(labels[split_index], + [self.batch_size_per_split]) + return images, labels + + def supports_datasets(self): + return True + + +class ImagenetPreprocessor(RecordInputImagePreprocessor): + + def preprocess(self, image_buffer, bbox, batch_position): + # pylint: disable=g-import-not-at-top + try: + from official.r1.resnet.imagenet_preprocessing import preprocess_image + except ImportError: + tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.') + raise + if self.train: + image = preprocess_image( + image_buffer, bbox, self.height, self.width, self.depth, + is_training=True) + else: + image = preprocess_image( + image_buffer, bbox, self.height, self.width, self.depth, + is_training=False) + return tf.cast(image, self.dtype) + + +class Cifar10ImagePreprocessor(BaseImagePreprocessor): + """Preprocessor for Cifar10 input images.""" + + def _distort_image(self, image): + """Distort one image for training a network. + + Adopted the standard data augmentation scheme that is widely used for + this dataset: the images are first zero-padded with 4 pixels on each side, + then randomly cropped to again produce distorted images; half of the images + are then horizontally mirrored. + + Args: + image: input image. + Returns: + distorted image. + """ + image = tf.image.resize_image_with_crop_or_pad( + image, self.height + 8, self.width + 8) + distorted_image = tf.random_crop(image, + [self.height, self.width, self.depth]) + # Randomly flip the image horizontally. 
+ distorted_image = tf.image.random_flip_left_right(distorted_image) + if self.summary_verbosity >= 3: + tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0)) + return distorted_image + + def _eval_image(self, image): + """Get the image for model evaluation.""" + distorted_image = tf.image.resize_image_with_crop_or_pad( + image, self.width, self.height) + if self.summary_verbosity >= 3: + tf.summary.image('cropped.image', tf.expand_dims(distorted_image, 0)) + return distorted_image + + def preprocess(self, raw_image): + """Preprocessing raw image.""" + if self.summary_verbosity >= 3: + tf.summary.image('raw.image', tf.expand_dims(raw_image, 0)) + if self.train and self.distortions: + image = self._distort_image(raw_image) + else: + image = self._eval_image(raw_image) + normalized = normalized_image(image) + return tf.cast(normalized, self.dtype) + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=-1): + # TODO(jsimsa): Implement datasets code path + del shift_ratio, params + with tf.name_scope('batch_processing'): + all_images, all_labels = dataset.read_data_files(subset) + all_images = tf.constant(all_images) + all_labels = tf.constant(all_labels) + input_image, input_label = tf.train.slice_input_producer( + [all_images, all_labels]) + input_image = tf.cast(input_image, self.dtype) + input_label = tf.cast(input_label, tf.int32) + # Ensure that the random shuffling has good mixing properties. + min_fraction_of_examples_in_queue = 0.4 + min_queue_examples = int(dataset.num_examples_per_epoch(subset) * + min_fraction_of_examples_in_queue) + raw_images, raw_labels = tf.train.shuffle_batch( + [input_image, input_label], batch_size=self.batch_size, + capacity=min_queue_examples + 3 * self.batch_size, + min_after_dequeue=min_queue_examples) + + images = [[] for i in range(self.num_splits)] + labels = [[] for i in range(self.num_splits)] + + # Create a list of size batch_size, each containing one image of the + # batch. Without the unstack call, raw_images[i] would still access the + # same image via a strided_slice op, but would be slower. + raw_images = tf.unstack(raw_images, axis=0) + raw_labels = tf.unstack(raw_labels, axis=0) + for i in xrange(self.batch_size): + split_index = i % self.num_splits + # The raw image read from data has the format [depth, height, width] + # reshape to the format returned by minibatch. + raw_image = tf.reshape(raw_images[i], + [dataset.depth, dataset.height, dataset.width]) + raw_image = tf.transpose(raw_image, [1, 2, 0]) + image = self.preprocess(raw_image) + images[split_index].append(image) + + labels[split_index].append(raw_labels[i]) + + for split_index in xrange(self.num_splits): + images[split_index] = tf.parallel_stack(images[split_index]) + labels[split_index] = tf.parallel_stack(labels[split_index]) + return images, labels + + +class COCOPreprocessor(BaseImagePreprocessor): + """Preprocessor for COCO dataset input images, boxes, and labels.""" + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=-1): + del shift_ratio # Not used when using datasets instead of data_flow_ops + with tf.name_scope('batch_processing'): + ds = self.create_dataset( + self.batch_size, self.num_splits, self.batch_size_per_split, + dataset, subset, self.train, params.datasets_repeat_cached_sample) + ds_iterator = self.create_iterator(ds) + + # Training data: 4 tuple + # Validation data: 5 tuple + # See get_input_shapes in models/ssd_model.py for details. 
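The CIFAR-10 augmentation described above for `Cifar10ImagePreprocessor` (pad four pixels per side, random crop, random horizontal flip) can be summarized in a few lines. This is a standalone sketch assuming a 32x32x3 input, using the same tf.compat.v1 ops.

```python
import tensorflow.compat.v1 as tf

def toy_cifar_distort(image, height=32, width=32, depth=3):
  # Zero-pad 4 pixels on each side, then take a random crop of the original size.
  padded = tf.image.resize_image_with_crop_or_pad(image, height + 8, width + 8)
  cropped = tf.random_crop(padded, [height, width, depth])
  # Mirror roughly half of the images.
  return tf.image.random_flip_left_right(cropped)
```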
+ input_len = 4 if subset == 'train' else 5 + input_lists = [[None for _ in range(self.num_splits)] + for _ in range(input_len)] + for d in xrange(self.num_splits): + input_list = ds_iterator.get_next() + for i in range(input_len): + input_lists[i][d] = input_list[i] + return input_lists + + def preprocess(self, data): + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + import ssd_constants # pylint: disable=g-import-not-at-top + from object_detection.core import preprocessor # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation') + image_buffer = data['image_buffer'] + boxes = data['groundtruth_boxes'] + classes = tf.reshape(data['groundtruth_classes'], [-1, 1]) + source_id = tf.string_to_number(data['source_id']) + raw_shape = data['raw_shape'] + + ssd_encoder = ssd_dataloader.Encoder() + + # Only 80 of the 90 COCO classes are used. + class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP) + classes = tf.gather(class_map, classes) + classes = tf.cast(classes, dtype=tf.float32) + + if self.train: + image, boxes, classes = ssd_dataloader.ssd_decode_and_crop( + image_buffer, boxes, classes, raw_shape) + # ssd_crop resizes and returns image of dtype float32 and does not change + # its range (i.e., value in between 0--255). Divide by 255. converts it + # to [0, 1] range. Not doing this before cropping to avoid dtype cast + # (which incurs additional memory copy). + image /= 255. + + image, boxes = preprocessor.random_horizontal_flip( + image=image, boxes=boxes) + # Random horizontal flip probability is 50% + # See https://github.com/tensorflow/models/blob/master/research/object_detection/core/preprocessor.py # pylint: disable=line-too-long + mlperf.logger.log(key=mlperf.tags.RANDOM_FLIP_PROBABILITY, value=0.5) + + image = tf.cast(image, self.dtype) + + encoded_returns = ssd_encoder.encode_labels(boxes, classes) + encoded_classes, encoded_boxes, num_matched_boxes = encoded_returns + + # Shape of image: [width, height, channel] + # Shape of encoded_boxes: [NUM_SSD_BOXES, 4] + # Shape of encoded_classes: [NUM_SSD_BOXES, 1] + # Shape of num_matched_boxes: [1] + return (image, encoded_boxes, encoded_classes, num_matched_boxes) + + else: + image = tf.image.decode_jpeg(image_buffer) + image = tf.image.resize_images( + image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE)) + # resize_image returns image of dtype float32 and does not change its + # range. Divide by 255 to convert image to [0, 1] range. + image /= 255. 
+ + image = ssd_dataloader.normalize_image(image) + image = tf.cast(image, self.dtype) + + def trim_and_pad(inp_tensor): + """Limit the number of boxes, and pad if necessary.""" + inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES] + num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0] + inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]]) + return tf.reshape(inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, + inp_tensor.get_shape()[1]]) + + boxes, classes = trim_and_pad(boxes), trim_and_pad(classes) + + # Shape of boxes: [MAX_NUM_EVAL_BOXES, 4] + # Shape of classes: [MAX_NUM_EVAL_BOXES, 1] + # Shape of source_id: [] (scalar tensor) + # Shape of raw_shape: [3] + return (image, boxes, classes, source_id, raw_shape) + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + try: + import ssd_dataloader # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError('To use the COCO dataset, you must clone the ' + 'repo https://github.com/tensorflow/models and add ' + 'tensorflow/models and tensorflow/models/research to ' + 'the PYTHONPATH, and compile the protobufs by ' + 'following https://github.com/tensorflow/models/blob/' + 'master/research/object_detection/g3doc/installation.md' + '#protobuf-compilation') + assert self.supports_datasets() + + glob_pattern = dataset.tf_record_pattern(subset) + ds = tf.data.TFRecordDataset.list_files(glob_pattern, shuffle=train) + # TODO(haoyuzhang): Enable map+filter fusion after cl/218399112 in release + # options = tf.data.Options() + # options.experimental_optimization = tf.data.experimental.OptimizationOptions() # pylint: disable=line-too-long + # options.experimental_optimization.map_and_filter_fusion = True + # ds = ds.with_options(options) + + ds = ds.apply( + tf.data.experimental.parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=datasets_parallel_interleave_cycle_length or 10, + sloppy=datasets_sloppy_parallel_interleave)) + mlperf.logger.log(key=mlperf.tags.INPUT_ORDER) + if datasets_repeat_cached_sample: + # Repeat a single sample element indefinitely to emulate memory-speed IO. + ds = ds.take(1).cache().repeat() + ds = ds.prefetch(buffer_size=batch_size) + if datasets_use_caching: + ds = ds.cache() + if train: + ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000)) + mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=10000) + mlperf.logger.log(key=mlperf.tags.INPUT_ORDER) + else: + ds = ds.repeat() + + ds = ds.map(ssd_dataloader.ssd_parse_example_proto, num_parallel_calls=64) + ds = ds.filter( + lambda data: tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)) + ds = ds.apply( + tf.data.experimental.map_and_batch( + map_func=self.preprocess, + batch_size=batch_size_per_split, + num_parallel_batches=num_splits, + drop_remainder=train)) + ds = ds.prefetch(buffer_size=num_splits) + if num_threads: + ds = threadpool.override_threadpool( + ds, + threadpool.PrivateThreadPool( + num_threads, display_name='input_pipeline_thread_pool')) + return ds + + def supports_datasets(self): + return True + + +class TestImagePreprocessor(BaseImagePreprocessor): + """Preprocessor used for testing. 
+ + set_fake_data() sets which images and labels will be output by minibatch(), + and must be called before minibatch(). This allows tests to easily specify + a set of images to use for training, without having to create any files. + + Queue runners must be started for this preprocessor to work. + """ + + def __init__(self, + batch_size, + output_shapes, + num_splits, + dtype, + train=None, + distortions=None, + resize_method=None, + shift_ratio=0, + summary_verbosity=0, + distort_color_in_yiq=False, + fuse_decode_and_crop=False, + match_mlperf=False): + super(TestImagePreprocessor, self).__init__( + batch_size, output_shapes, num_splits, dtype, train, distortions, + resize_method, shift_ratio, summary_verbosity=summary_verbosity, + distort_color_in_yiq=distort_color_in_yiq, + fuse_decode_and_crop=fuse_decode_and_crop, match_mlperf=match_mlperf) + self.expected_subset = None + + def set_fake_data(self, fake_images, fake_labels): + assert len(fake_images.shape) == 4 + assert len(fake_labels.shape) == 1 + num_images = fake_images.shape[0] + assert num_images == fake_labels.shape[0] + assert num_images % self.batch_size == 0 + self.fake_images = fake_images + self.fake_labels = fake_labels + + def minibatch(self, + dataset, + subset, + params, + shift_ratio=0): + """Get test image batches.""" + del dataset, params + if (not hasattr(self, 'fake_images') or + not hasattr(self, 'fake_labels')): + raise ValueError('Must call set_fake_data() before calling minibatch ' + 'on TestImagePreprocessor') + if self.expected_subset is not None: + assert subset == self.expected_subset + + shift_ratio = shift_ratio or self.shift_ratio + fake_images = cnn_util.roll_numpy_batches(self.fake_images, self.batch_size, + shift_ratio) + fake_labels = cnn_util.roll_numpy_batches(self.fake_labels, self.batch_size, + shift_ratio) + + with tf.name_scope('batch_processing'): + image_slice, label_slice = tf.train.slice_input_producer( + [fake_images, fake_labels], + shuffle=False, + name='image_slice') + raw_images, raw_labels = tf.train.batch( + [image_slice, label_slice], batch_size=self.batch_size, + name='image_batch') + images = [[] for _ in range(self.num_splits)] + labels = [[] for _ in range(self.num_splits)] + for i in xrange(self.batch_size): + split_index = i % self.num_splits + raw_image = tf.cast(raw_images[i], self.dtype) + images[split_index].append(raw_image) + labels[split_index].append(raw_labels[i]) + for split_index in xrange(self.num_splits): + images[split_index] = tf.parallel_stack(images[split_index]) + labels[split_index] = tf.parallel_stack(labels[split_index]) + + normalized = [normalized_image(part) for part in images] + return [[tf.cast(part, self.dtype) for part in normalized], labels] + + +class LibrispeechPreprocessor(InputPreprocessor): + """Preprocessor for librispeech class for all image model preprocessors.""" + + def __init__(self, batch_size, output_shapes, num_splits, dtype, train, + **kwargs): + del kwargs + super(LibrispeechPreprocessor, self).__init__(batch_size, output_shapes) + self.num_splits = num_splits + self.dtype = dtype + self.is_train = train + if self.batch_size % self.num_splits != 0: + raise ValueError(('batch_size must be a multiple of num_splits: ' + 'batch_size %d, num_splits: %d') % (self.batch_size, + self.num_splits)) + self.batch_size_per_split = self.batch_size // self.num_splits + + def create_dataset(self, + batch_size, + num_splits, + batch_size_per_split, + dataset, + subset, + train, + datasets_repeat_cached_sample, + num_threads=None, + 
datasets_use_caching=False, + datasets_parallel_interleave_cycle_length=None, + datasets_sloppy_parallel_interleave=False, + datasets_parallel_interleave_prefetch=None): + """Creates a dataset for the benchmark.""" + # TODO(laigd): currently the only difference between this and the one in + # BaseImagePreprocessor is, this uses map() and padded_batch() while the + # latter uses tf.data.experimental.map_and_batch(). Try to merge them. + assert self.supports_datasets() + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train) + ds = ds.apply( + tf.data.experimental.parallel_interleave( + tf.data.TFRecordDataset, + cycle_length=datasets_parallel_interleave_cycle_length or 10, + sloppy=datasets_sloppy_parallel_interleave, + prefetch_input_elements=datasets_parallel_interleave_prefetch)) + if datasets_repeat_cached_sample: + # Repeat a single sample element indefinitely to emulate memory-speed IO. + ds = ds.take(1).cache().repeat() + counter = tf.data.Dataset.range(batch_size) + counter = counter.repeat() + ds = tf.data.Dataset.zip((ds, counter)) + ds = ds.prefetch(buffer_size=batch_size) + if datasets_use_caching: + ds = ds.cache() + if train: + ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000)) + else: + ds = ds.repeat() + ds = ds.map(map_func=self.parse_and_preprocess, + num_parallel_calls=batch_size_per_split*num_splits) + ds = ds.padded_batch( + batch_size=batch_size_per_split, + padded_shapes=tuple([ + tf.TensorShape(output_shape[1:]) + for output_shape in self.output_shapes + ]), + drop_remainder=True) + ds = ds.prefetch(buffer_size=num_splits) + if num_threads: + ds = threadpool.override_threadpool( + ds, + threadpool.PrivateThreadPool( + num_threads, display_name='input_pipeline_thread_pool')) + return ds + + def minibatch(self, dataset, subset, params, shift_ratio=-1): + assert params.use_datasets + # TODO(laigd): unify this with CNNModel's minibatch() + # TODO(laigd): in distributed mode we use shift_ratio so different workers + # won't work on same inputs, so we should respect that. 
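Because utterances have different numbers of frames, the LibriSpeech pipeline above batches with `padded_batch()` rather than `map_and_batch()`. Here is a toy sketch of that pattern using synthetic tensors instead of parsed TFRecords; the lengths and the 161-dimensional feature size are placeholders for illustration.

```python
import tensorflow.compat.v1 as tf

# Three "utterances" with different numbers of frames, 161 features each.
lengths = [5, 3, 7]
ds = tf.data.Dataset.from_tensor_slices(tf.constant(lengths))
ds = ds.map(lambda n: tf.ones([n, 161]))          # variable-length features
ds = ds.padded_batch(
    batch_size=2,
    padded_shapes=tf.TensorShape([None, 161]),    # pad the time dimension
    drop_remainder=True)                          # keep a static batch size
```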
+ del shift_ratio + with tf.name_scope('batch_processing'): + ds = self.create_dataset( + self.batch_size, + self.num_splits, + self.batch_size_per_split, + dataset, + subset, + self.is_train, + datasets_repeat_cached_sample=params.datasets_repeat_cached_sample, + num_threads=params.datasets_num_private_threads, + datasets_use_caching=params.datasets_use_caching, + datasets_parallel_interleave_cycle_length=( + params.datasets_parallel_interleave_cycle_length), + datasets_sloppy_parallel_interleave=( + params.datasets_sloppy_parallel_interleave), + datasets_parallel_interleave_prefetch=( + params.datasets_parallel_interleave_prefetch)) + ds_iterator = self.create_iterator(ds) + + # The four lists are: input spectrogram feature, labels, input lengths, + # label lengths + input_lists = [[None for _ in range(self.num_splits)] for _ in range(4)] + for d in xrange(self.num_splits): + input_list = ds_iterator.get_next() + for i in range(4): + input_lists[i][d] = input_list[i] + + assert self.output_shapes == [ + input_lists[i][0].shape.as_list() for i in range(4) + ] + return tuple(input_lists) + + def supports_datasets(self): + return True + + def parse_and_preprocess(self, value, batch_position): + """Parse an TFRecord.""" + del batch_position + assert self.supports_datasets() + context_features = { + 'labels': tf.VarLenFeature(dtype=tf.int64), + 'input_length': tf.FixedLenFeature([], dtype=tf.int64), + 'label_length': tf.FixedLenFeature([], dtype=tf.int64), + } + sequence_features = { + 'features': tf.FixedLenSequenceFeature([161], dtype=tf.float32) + } + context_parsed, sequence_parsed = tf.parse_single_sequence_example( + serialized=value, + context_features=context_features, + sequence_features=sequence_features, + ) + + return [ + # Input + tf.expand_dims(sequence_parsed['features'], axis=2), + # Label + tf.cast( + tf.reshape( + tf.sparse_tensor_to_dense(context_parsed['labels']), [-1]), + dtype=tf.int32), + # Input length + tf.cast( + tf.reshape(context_parsed['input_length'], [1]), + dtype=tf.int32), + # Label length + tf.cast( + tf.reshape(context_parsed['label_length'], [1]), + dtype=tf.int32), + ] diff --git a/cv/classification/resnet50/tensorflow/run_tests.py b/cv/classification/resnet50/tensorflow/run_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..5b3dcd3276c776a1a585181229fae19e691106e3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_tests.py @@ -0,0 +1,107 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
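For reference, this self-contained sketch builds a toy `tf.train.SequenceExample` in the layout that `LibrispeechPreprocessor.parse_and_preprocess()` above expects (labels and lengths in the context, 161-dimensional frames in the feature lists) and parses it back with the same feature specs. All values are made up.

```python
import tensorflow.compat.v1 as tf

seq_ex = tf.train.SequenceExample()
seq_ex.context.feature['labels'].int64_list.value.extend([3, 7, 7])
seq_ex.context.feature['input_length'].int64_list.value.append(2)
seq_ex.context.feature['label_length'].int64_list.value.append(3)
frames = seq_ex.feature_lists.feature_list['features']
for _ in range(2):
  frames.feature.add().float_list.value.extend([0.0] * 161)

context, sequence = tf.parse_single_sequence_example(
    serialized=tf.constant(seq_ex.SerializeToString()),
    context_features={
        'labels': tf.VarLenFeature(tf.int64),
        'input_length': tf.FixedLenFeature([], tf.int64),
        'label_length': tf.FixedLenFeature([], tf.int64),
    },
    sequence_features={
        'features': tf.FixedLenSequenceFeature([161], tf.float32),
    })
# sequence['features'] has shape [num_frames, 161]; 'labels' comes back sparse.
```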
+# ============================================================================== +"""Runs the tf_cnn_benchmarks tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import unittest + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf + +import all_reduce_benchmark_test +import allreduce_test +import benchmark_cnn_distributed_test +import benchmark_cnn_test +import cnn_util_test +import variable_mgr_util_test +from models import model_config + +# Ideally, we wouldn't need this option, and run both distributed tests and non- +# distributed tests. But, TensorFlow allocates all the GPU memory by default, so +# the non-distributed tests allocate all the GPU memory. The distributed tests +# spawn processes that run TensorFlow, and cannot run if all the GPU memory is +# already allocated. If a non-distributed test is run, then a distributed test +# is run in the same process, the distributed test will fail because there is no +# more GPU memory for the spawned processes to allocate. +absl_flags.DEFINE_boolean('run_distributed_tests', False, + 'If True, run the distributed tests. If False, the' + 'non-distributed tests.') + +absl_flags.DEFINE_boolean('full_tests', False, + 'If True, all distributed or non-distributed tests ' + 'are run, which can take hours. If False, only a ' + 'subset of tests will be run. This subset runs much ' + 'faster and tests almost all the functionality as ' + 'the full set of tests, so it is recommended to keep ' + 'this option set to False.') + +FLAGS = absl_flags.FLAGS + + +def main(_): + loader = unittest.defaultTestLoader + if FLAGS.full_tests: + suite = unittest.TestSuite([ + loader.loadTestsFromModule(allreduce_test), + loader.loadTestsFromModule(cnn_util_test), + loader.loadTestsFromModule(variable_mgr_util_test), + loader.loadTestsFromModule(benchmark_cnn_test), + loader.loadTestsFromModule(all_reduce_benchmark_test), + ]) + if model_config.can_import_contrib: + from models.tf1_only import nasnet_test # pylint: disable=g-import-not-at-top + suite.addTest(loader.loadTestsFromModule(nasnet_test)) + dist_suite = unittest.TestSuite([ + loader.loadTestsFromModule(benchmark_cnn_distributed_test), + ]) + else: + suite = unittest.TestSuite([ + loader.loadTestsFromModule(allreduce_test), + loader.loadTestsFromModule(cnn_util_test), + loader.loadTestsFromModule(all_reduce_benchmark_test), + loader.loadTestsFromModule(variable_mgr_util_test), + loader.loadTestsFromTestCase(benchmark_cnn_test.TestAlexnetModel), + loader.loadTestsFromTestCase(benchmark_cnn_test.TfCnnBenchmarksTest), + loader.loadTestsFromTestCase(benchmark_cnn_test.VariableUpdateTest), + loader.loadTestsFromTestCase( + benchmark_cnn_test.VariableMgrLocalReplicatedTest), + ]) + dist_suite = unittest.TestSuite([ + loader.loadTestsFromNames([ + 'benchmark_cnn_distributed_test.DistributedVariableUpdateTest' + '.testVarUpdateDefault', + + 'benchmark_cnn_distributed_test.TfCnnBenchmarksDistributedTest' + '.testParameterServer', + ]), + ]) + + if FLAGS.run_distributed_tests: + print('Running distributed tests') + result = unittest.TextTestRunner(verbosity=2).run(dist_suite) + else: + print('Running non-distributed tests') + result = unittest.TextTestRunner(verbosity=2).run(suite) + sys.exit(not result.wasSuccessful()) + + +if __name__ == '__main__': + tf.disable_v2_behavior() + app.run(main) diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh 
b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..98639e5c9f5656c7a46bcc5a1f00609c1170a3f9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +################################################# +# Prepare training arguments +################################################# + +i=0 +model="alexnet" +for arg in "$@" +do + if [ $i -eq 0 ]; then + model=$arg + let i++ + continue + fi + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done +echo "## Training model: ${model}" + + +: ${BATCH_SIZE:=32} +# TRAIN_EPOCHS=10 +# optional optimizer: momentum, rmsprop, momentum, sgd +OPTIMIZER=momentum +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/${model}_distributed" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/${model}_distributed + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +################################################# +# Prepare devices +################################################# +devices=$CUDA_VISIBLE_DEVICES +if [ -n "$devices" ]; then + devices=(${devices//,/ }) + num_devices=${#devices[@]} +else + devices=(0 1) + num_devices=2 +fi +echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}" +echo "num_devices: ${num_devices}" + +if [ "${num_devices}" == "1" ]; then + echo "Error: The number of devices must be greater then 1 for distributed training, but got CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}." 
+ exit 0 +fi + +################################################# +# Prepare distributed training arguments +################################################# +worker_hosts="" +i=0 +for device in "${devices[@]}"; +do + if [ "$i" == "0" ]; then + let i++ + continue + fi + let i++ + worker_hosts="${worker_hosts},127.0.0.1:5000${device}" +done +worker_hosts=${worker_hosts#*,} +echo "worker_hosts: ${worker_hosts}" + +################################################# +# Handle CTRL-C +################################################# +trap ctrl_c INT +function ctrl_c() { + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + done + exit 0 +} + +################################################# +# Start distributed training +################################################# + +pid_list=() +last_device=`expr ${num_devices} - 1` +i=0 +for device in "${devices[@]}"; +do + job_name="worker" + if [ "${i}" == "0" ]; then + job_name="ps" + fi + + if [ ${i} -le 1 ]; then + task_index=0 + else + task_index=`expr ${i} - 1` + fi + + if [ "${i}" == "${last_device}" ]; then + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" + else + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" + fi + let i++ + pid_list+=($!) +done + +echo "All subprocess: ${pid_list[*]}" +ctrl_c +exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/run_train_resnet50_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_resnet50_distributed_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..667cf5bb067665ad83e1f9bac95c8c797f6d91b3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_resnet50_distributed_imagenette.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +bash ./run_train_distributed_imagenette.sh resnet50 "$@" +exit $? \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/run_train_resnet50_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_resnet50_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..e65bfffb9851dfebc41ffc2eba38dd033588057e --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_resnet50_imagenette.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=adam +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/resnet50" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/resnet50 + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@" +do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=resnet50 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + +exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/run_train_resnet50_multigpu_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_resnet50_multigpu_imagenette.sh new file mode 100644 index 0000000000000000000000000000000000000000..d2325ad0330446efb41ee6476de78551cc49c4ad --- /dev/null +++ b/cv/classification/resnet50/tensorflow/run_train_resnet50_multigpu_imagenette.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +bash ./get_imagenette.sh + +export TF_CUDNN_USE_AUTOTUNE=1 +export TF_CPP_MIN_LOG_LEVEL=1 + +: ${BATCH_SIZE:=32} +#TRAIN_EPOCHS=10 +# optional optimizer: adam, rmsprop, momentum, sgd +OPTIMIZER=adam +DATE=`date +%Y%m%d%H%M%S` + +LOG_DIR="logs/resnet50_multigpu" +DATA_DIR=./imagenette +BASE_DIR=train_dir +TRAIN_DIR=${BASE_DIR}/resnet50_multigpu + +mkdir -p ${LOG_DIR} +mkdir -p ${BASE_DIR} +rm -rf ${TRAIN_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +i=0 +for arg in "$@" +do + if [[ $arg =~ "--epoch" ]]; then + new_args[$i]="--num_epochs" + else + new_args[$i]=$arg + fi + let i++ +done + +source ./get_num_devices.sh + +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=resnet50 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow/ssd_constants.py b/cv/classification/resnet50/tensorflow/ssd_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..77fa0149b79f827b4e021afa67aa0e9409620e78 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/ssd_constants.py @@ -0,0 +1,118 @@ +# Copyright 2018 Google. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Central location for all constants related to MLPerf SSD.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# ============================================================================== +# == Model ===================================================================== +# ============================================================================== +IMAGE_SIZE = 300 + +# TODO(taylorrobie): MLPerf uses 80, but COCO documents 90. (RetinaNet uses 90) +# Update(taylorrobie): Labels > 81 show up in the pipeline. This will need to +# be resolved. +NUM_CLASSES = 81 # Including "no class". Not all COCO classes are used. + +# Note: Zero is special. (Background class) CLASS_INV_MAP[0] must be zero. 
+CLASS_INV_MAP = ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, + 88, 89, 90) +_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)} +CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1)) + +NUM_SSD_BOXES = 8732 + +RESNET_DEPTH = 34 + +"""SSD specific""" +MIN_LEVEL = 3 +MAX_LEVEL = 8 + +FEATURE_SIZES = (38, 19, 10, 5, 3, 1) +STEPS = (8, 16, 32, 64, 100, 300) + +# https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py +SCALES = (21, 45, 99, 153, 207, 261, 315) +ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,)) +NUM_DEFAULTS = (4, 6, 6, 6, 4, 4) +NUM_DEFAULTS_BY_LEVEL = {3: 4, 4: 6, 5: 6, 6: 6, 7: 4, 8: 4} +SCALE_XY = 0.1 +SCALE_HW = 0.2 +BOX_CODER_SCALES = (1 / SCALE_XY, 1 / SCALE_XY, 1 / SCALE_HW, 1 / SCALE_HW) +MATCH_THRESHOLD = 0.5 + +# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683 +NORMALIZATION_MEAN = (0.485, 0.456, 0.406) +NORMALIZATION_STD = (0.229, 0.224, 0.225) + +# SSD Cropping +NUM_CROP_PASSES = 50 +CROP_MIN_IOU_CHOICES = (0, 0.1, 0.3, 0.5, 0.7, 0.9) +P_NO_CROP_PER_PASS = 1 / (len(CROP_MIN_IOU_CHOICES) + 1) + +# Hard example mining +NEGS_PER_POSITIVE = 3 + +# Batch normalization +BATCH_NORM_DECAY = 0.997 +BATCH_NORM_EPSILON = 1e-4 + + +# ============================================================================== +# == Optimizer ================================================================= +# ============================================================================== +LEARNING_RATE_SCHEDULE = ( + (0, 1e-3), + (160000, 1e-4), + (200000, 1e-5), +) +MOMENTUM = 0.9 +WEIGHT_DECAY = 5e-4 + + +# ============================================================================== +# == Keys ====================================================================== +# ============================================================================== +BOXES = "boxes" +CLASSES = "classes" +NUM_MATCHED_BOXES = "num_matched_boxes" +IMAGE = "image" +SOURCE_ID = "source_id" +RAW_SHAPE = "raw_shape" +PRED_BOXES = "pred_boxes" +PRED_SCORES = "pred_scores" + + +# ============================================================================== +# == Evaluation ================================================================ +# ============================================================================== + +# Note: This is based on a batch size of 32 +# https://github.com/mlperf/reference/blob/master/single_stage_detector/ssd/train.py#L21-L37 +CHECKPOINT_FREQUENCY = 20000 +MAX_NUM_EVAL_BOXES = 200 +OVERLAP_CRITERIA = 0.5 # Used for nonmax supression +MIN_SCORE = 0.05 # Minimum score to be considered during evaluation. +DUMMY_SCORE = -1e5 # If no boxes are matched. + +ANNOTATION_FILE = "annotations/instances_val2017.json" +COCO_NUM_TRAIN_IMAGES = 118287 +COCO_NUM_VAL_IMAGES = 4952 diff --git a/cv/classification/resnet50/tensorflow/ssd_dataloader.py b/cv/classification/resnet50/tensorflow/ssd_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..907d30903735d5181abbf18b02118a5eec2540ab --- /dev/null +++ b/cv/classification/resnet50/tensorflow/ssd_dataloader.py @@ -0,0 +1,405 @@ +# Copyright 2018 Google. All Rights Reserved. 
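A quick worked example of the `CLASS_MAP`/`CLASS_INV_MAP` inversion defined in ssd_constants.py above; the tuple is truncated here purely for illustration. COCO category ids are sparse, so `CLASS_MAP` sends each used id to a contiguous label and unused ids to -1.

```python
CLASS_INV_MAP = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13)  # truncated for the example
_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)}
CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1))

assert CLASS_MAP[13] == 12   # COCO id 13 becomes contiguous label 12
assert CLASS_MAP[12] == -1   # COCO id 12 is not used
assert CLASS_MAP[0] == 0     # background stays 0
```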
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Data loader and processing.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools as it +import math + +import numpy as np +import tensorflow.compat.v1 as tf + +from object_detection.box_coders import faster_rcnn_box_coder +from object_detection.core import box_list +from object_detection.core import region_similarity_calculator +from object_detection.core import target_assigner +from object_detection.matchers import argmax_matcher +import mlperf +import ssd_constants + + +class DefaultBoxes(object): + """Default bounding boxes for 300x300 5 layer SSD. + + Default bounding boxes generation follows the order of (W, H, anchor_sizes). + Therefore, the tensor converted from DefaultBoxes has a shape of + [anchor_sizes, H, W, 4]. The last dimension is the box coordinates; 'ltrb' + is [ymin, xmin, ymax, xmax] while 'xywh' is [cy, cx, h, w]. + """ + + def __init__(self): + fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS) + + self.default_boxes = [] + # size of feature and number of feature + for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES): + sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE + sk2 = ssd_constants.SCALES[idx+1] / ssd_constants.IMAGE_SIZE + sk3 = math.sqrt(sk1*sk2) + all_sizes = [(sk1, sk1), (sk3, sk3)] + + for alpha in ssd_constants.ASPECT_RATIOS[idx]: + w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha) + all_sizes.append((w, h)) + all_sizes.append((h, w)) + + assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx] + + for w, h in all_sizes: + for i, j in it.product(range(feature_size), repeat=2): + cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx] + box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w)) + self.default_boxes.append(box) + + assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES + + mlperf.logger.log(key=mlperf.tags.FEATURE_SIZES, + value=ssd_constants.FEATURE_SIZES) + mlperf.logger.log(key=mlperf.tags.STEPS, + value=ssd_constants.STEPS) + mlperf.logger.log(key=mlperf.tags.SCALES, + value=ssd_constants.SCALES) + mlperf.logger.log(key=mlperf.tags.ASPECT_RATIOS, + value=ssd_constants.ASPECT_RATIOS) + mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS, + value=ssd_constants.NUM_SSD_BOXES) + + def to_ltrb(cy, cx, h, w): + return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2 + + # For IoU calculation + self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes) + + def __call__(self, order='ltrb'): + if order == 'ltrb': return self.default_boxes_ltrb + if order == 'xywh': return self.default_boxes + + +def calc_iou_tensor(boxes1, boxes2): + """Calculation of IoU based on two boxes tensor. 
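The `DefaultBoxes` construction above can be sanity-checked with a little arithmetic: the per-location default counts and feature-map sizes multiply out to exactly `NUM_SSD_BOXES`, and the scales for the first layer work out as shown in this pure-Python check (constant values copied from ssd_constants.py).

```python
import math

FEATURE_SIZES = (38, 19, 10, 5, 3, 1)
NUM_DEFAULTS = (4, 6, 6, 6, 4, 4)
SCALES = (21, 45, 99, 153, 207, 261, 315)
IMAGE_SIZE = 300

# 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 == 8732
assert sum(f * f * n for f, n in zip(FEATURE_SIZES, NUM_DEFAULTS)) == 8732

# First feature map: base scale 0.07, next scale 0.15, and the extra
# "in-between" square box uses their geometric mean (~0.102).
sk1 = SCALES[0] / IMAGE_SIZE
sk2 = SCALES[1] / IMAGE_SIZE
sk3 = math.sqrt(sk1 * sk2)
```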
+
+  Reference to https://github.com/kuangliu/pytorch-ssd
+
+  Args:
+    boxes1: shape (N, 4), four coordinates of N boxes
+    boxes2: shape (M, 4), four coordinates of M boxes
+  Returns:
+    IoU: shape (N, M), IoU of the i-th box in `boxes1` and j-th box in `boxes2`
+  """
+  b1_left, b1_top, b1_right, b1_bottom = tf.split(boxes1, 4, axis=1)
+  b2_left, b2_top, b2_right, b2_bottom = tf.split(boxes2, 4, axis=1)
+
+  # Shape of intersect_* (N, M)
+  intersect_left = tf.maximum(b1_left, tf.transpose(b2_left))
+  intersect_top = tf.maximum(b1_top, tf.transpose(b2_top))
+  intersect_right = tf.minimum(b1_right, tf.transpose(b2_right))
+  intersect_bottom = tf.minimum(b1_bottom, tf.transpose(b2_bottom))
+
+  boxes1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
+  boxes2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
+
+  intersect = tf.multiply(tf.maximum((intersect_right - intersect_left), 0),
+                          tf.maximum((intersect_bottom - intersect_top), 0))
+  union = boxes1_area + tf.transpose(boxes2_area) - intersect
+  iou = intersect / union
+
+  return iou
+
+
+def ssd_parse_example_proto(example_serialized):
+  """Parses an Example proto containing a training example of an image.
+
+  Each Example proto contains the following fields that we care about:
+
+    image/encoded: tf.string (JPEG-encoded image bytes)
+    image/source_id: tf.string
+    image/height: tf.int64
+    image/width: tf.int64
+    image/object/bbox/xmin: tf.VarLenFeature(tf.float32)
+    image/object/bbox/xmax: tf.VarLenFeature(tf.float32)
+    image/object/bbox/ymin: tf.VarLenFeature(tf.float32)
+    image/object/bbox/ymax: tf.VarLenFeature(tf.float32)
+    image/object/class/label: tf.VarLenFeature(tf.int64)
+    image/object/class/text: tf.VarLenFeature(tf.string)
+
+  A complete decoder can be found in:
+  https://github.com/tensorflow/models/blob/master/research/object_detection/data_decoders/tf_example_decoder.py
+
+  Args:
+    example_serialized: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+
+  Returns:
+    A dictionary with the following key-values:
+      image_buffer: Tensor tf.string containing the contents of a JPEG file.
+      groundtruth_boxes: Tensor tf.float32 of shape [num_boxes, 4], containing
+        coordinates of object bounding boxes.
+      groundtruth_classes: Tensor tf.int64 of shape [num_boxes, 1], containing
+        class labels of objects.
+      source_id: unique image identifier.
+      raw_shape: [height, width, 3].
+ """ + feature_map = { + 'image/encoded': tf.FixedLenFeature( + (), dtype=tf.string, default_value=''), + 'image/source_id': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/height': tf.FixedLenFeature((), tf.int64, default_value=1), + 'image/width': tf.FixedLenFeature((), tf.int64, default_value=1), + 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/class/label': tf.VarLenFeature(dtype=tf.int64), + } + features = tf.parse_single_example(example_serialized, feature_map) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 1) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 1) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 1) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 1) + + image_buffer = features['image/encoded'] + # Bounding box coordinates should be in ltrb order + boxes = tf.concat([ymin, xmin, ymax, xmax], 1) + classes = tf.expand_dims(features['image/object/class/label'].values, 1) + source_id = features['image/source_id'] + raw_shape = tf.stack([features['image/height'], features['image/width'], 3]) + + return {'image_buffer': image_buffer, + 'groundtruth_boxes': boxes, + 'groundtruth_classes': classes, + 'source_id': source_id, + 'raw_shape': raw_shape} + + +def ssd_decode_and_crop(image_buffer, boxes, classes, raw_shape): + """Crop image randomly and decode the cropped region. + + This function will crop an image to meet the following requirements: + 1. height to width ratio between 0.5 and 2; + 2. IoUs of some boxes exceed specified threshold; + 3. At least one box center is in the cropped region. + We defer the jpeg decoding task until after the crop to avoid wasted work. + + Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation + + Args: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + boxes: Tensor tf.float32 of shape [num_boxes, 4], containing coordinates of + object bounding boxes. + classes: Tensor tf.int64 of shape [num_boxes, 1], containing class labels + of objects. + raw_shape: [height, width, 3]. + + Returns: + resized_image: decoded, cropped, and resized image Tensor tf.float32 of + shape [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE, 3], value + range 0--255. + cropped_boxes: box coordinates for objects in the cropped region. + cropped_classes: class labels for objects in the cropped region. 
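+
+  A rough usage sketch (variable names are illustrative only):
+
+    parsed = ssd_parse_example_proto(serialized_example)
+    image, boxes, classes = ssd_decode_and_crop(
+        parsed['image_buffer'], parsed['groundtruth_boxes'],
+        parsed['groundtruth_classes'], parsed['raw_shape'])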
+ """ + + num_boxes = tf.shape(boxes)[0] + + def no_crop_check(): + return (tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32) + < ssd_constants.P_NO_CROP_PER_PASS) + + def no_crop_proposal(): + return ( + tf.ones((), tf.bool), + tf.convert_to_tensor([0, 0, 1, 1], dtype=tf.float32), + tf.ones((num_boxes,), tf.bool), + ) + + def crop_proposal(): + rand_vec = lambda minval, maxval: tf.random_uniform( + shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval, + dtype=tf.float32) + + width, height = rand_vec(0.3, 1), rand_vec(0.3, 1) + left, top = rand_vec(0, 1-width), rand_vec(0, 1-height) + + right = left + width + bottom = top + height + + ltrb = tf.concat([left, top, right, bottom], axis=1) + + min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0] + ious = calc_iou_tensor(ltrb, boxes) + + # discard any bboxes whose center not in the cropped image + xc, yc = [tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :], + (ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)] + + masks = tf.reduce_all(tf.stack([ + tf.greater(xc, tf.tile(left, (1, num_boxes))), + tf.less(xc, tf.tile(right, (1, num_boxes))), + tf.greater(yc, tf.tile(top, (1, num_boxes))), + tf.less(yc, tf.tile(bottom, (1, num_boxes))), + ], axis=2), axis=2) + + # Checks of whether a crop is valid. + valid_aspect = tf.logical_and(tf.less(height/width, 2), + tf.less(width/height, 2)) + valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True) + valid_masks = tf.reduce_any(masks, axis=1, keepdims=True) + + valid_all = tf.cast(tf.reduce_all(tf.concat( + [valid_aspect, valid_ious, valid_masks], axis=1), axis=1), tf.int32) + + # One indexed, as zero is needed for the case of no matches. + index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32) + + # Either one-hot, or zeros if there is no valid crop. + selection = tf.equal(tf.reduce_max(index * valid_all), index) + + use_crop = tf.reduce_any(selection) + output_ltrb = tf.reduce_sum(tf.multiply(ltrb, tf.tile(tf.cast( + selection, tf.float32)[:, tf.newaxis], (1, 4))), axis=0) + output_masks = tf.reduce_any(tf.logical_and(masks, tf.tile( + selection[:, tf.newaxis], (1, num_boxes))), axis=0) + + return use_crop, output_ltrb, output_masks + + def proposal(*args): + return tf.cond( + pred=no_crop_check(), + true_fn=no_crop_proposal, + false_fn=crop_proposal, + ) + + _, crop_bounds, box_masks = tf.while_loop( + cond=lambda x, *_: tf.logical_not(x), + body=proposal, + loop_vars=[tf.zeros((), tf.bool), tf.zeros((4,), tf.float32), tf.zeros((num_boxes,), tf.bool)], + ) + + filtered_boxes = tf.boolean_mask(boxes, box_masks, axis=0) + + mlperf.logger.log(key=mlperf.tags.NUM_CROPPING_ITERATIONS, + value=ssd_constants.NUM_CROP_PASSES) + + # Clip boxes to the cropped region. + filtered_boxes = tf.stack([ + tf.maximum(filtered_boxes[:, 0], crop_bounds[0]), + tf.maximum(filtered_boxes[:, 1], crop_bounds[1]), + tf.minimum(filtered_boxes[:, 2], crop_bounds[2]), + tf.minimum(filtered_boxes[:, 3], crop_bounds[3]), + ], axis=1) + + left = crop_bounds[0] + top = crop_bounds[1] + width = crop_bounds[2] - left + height = crop_bounds[3] - top + + cropped_boxes = tf.stack([ + (filtered_boxes[:, 0] - left) / width, + (filtered_boxes[:, 1] - top) / height, + (filtered_boxes[:, 2] - left) / width, + (filtered_boxes[:, 3] - top) / height, + ], axis=1) + + # crop_window containing integer coordinates of cropped region. A normalized + # coordinate value of y should be mapped to the image coordinate at + # y * (height - 1). 
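+  # tf.image.decode_and_crop_jpeg expects crop_window as an integer
+  # [crop_y, crop_x, crop_height, crop_width] window in pixels, so the
+  # normalized crop bounds are scaled by the raw image shape below.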
+ raw_shape = tf.cast(raw_shape, tf.float32) + crop_window = tf.stack([left * (raw_shape[0] - 1), + top * (raw_shape[1] - 1), + width * raw_shape[0], + height * raw_shape[1]]) + crop_window = tf.cast(crop_window, tf.int32) + + # Fused op only decodes the cropped portion of an image + cropped_image = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=3) + + # Resize converts image dtype from uint8 to float32, without rescaling values. + resized_image = tf.image.resize_images( + cropped_image, [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE]) + mlperf.logger.log(key=mlperf.tags.INPUT_SIZE, + value=ssd_constants.IMAGE_SIZE) + + cropped_classes = tf.boolean_mask(classes, box_masks, axis=0) + + return resized_image, cropped_boxes, cropped_classes + + +def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0): + """Distort the color of the image.""" + with tf.name_scope('distort_color'): + if brightness > 0: + image = tf.image.random_brightness(image, max_delta=brightness) + if contrast > 0: + image = tf.image.random_contrast( + image, lower=1-contrast, upper=1+contrast) + if saturation > 0: + image = tf.image.random_saturation( + image, lower=1-saturation, upper=1+saturation) + if hue > 0: + image = tf.image.random_hue(image, max_delta=hue) + return image + + +def normalize_image(images): + """Normalize image to zero mean and unit variance. + + Args: + images: a tensor representing images, at least 3-D. + Returns: + images normalized by mean and stdev. + """ + data_type = images.dtype + mean = tf.constant(ssd_constants.NORMALIZATION_MEAN, data_type) + std = tf.constant(ssd_constants.NORMALIZATION_STD, data_type) + images = tf.divide(tf.subtract(images, mean), std) + + mlperf.logger.log(key=mlperf.tags.DATA_NORMALIZATION_MEAN, + value=ssd_constants.NORMALIZATION_MEAN) + mlperf.logger.log(key=mlperf.tags.DATA_NORMALIZATION_STD, + value=ssd_constants.NORMALIZATION_STD) + return images + + +class Encoder(object): + """Encoder for SSD boxes and labels.""" + + def __init__(self): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher( + matched_threshold=ssd_constants.MATCH_THRESHOLD, + unmatched_threshold=ssd_constants.MATCH_THRESHOLD, + negatives_lower_than_unmatched=True, + force_match_for_each_row=True) + + box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=ssd_constants.BOX_CODER_SCALES) + + self.default_boxes = DefaultBoxes()('ltrb') + self.default_boxes = box_list.BoxList( + tf.convert_to_tensor(self.default_boxes)) + self.assigner = target_assigner.TargetAssigner( + similarity_calc, matcher, box_coder) + + def encode_labels(self, gt_boxes, gt_labels): + target_boxes = box_list.BoxList(gt_boxes) + encoded_classes, _, encoded_boxes, _, matches = self.assigner.assign( + self.default_boxes, target_boxes, gt_labels) + num_matched_boxes = tf.reduce_sum( + tf.cast(tf.not_equal(matches, -1), tf.float32)) + return encoded_classes, encoded_boxes, num_matched_boxes diff --git a/cv/classification/resnet50/tensorflow/test_data/__init__.py b/cv/classification/resnet50/tensorflow/test_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00000-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00000-of-00008 new file mode 100644 index 
0000000000000000000000000000000000000000..4e65b92a9a5f252f7b1a9d9048e834217f468971 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00000-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00001-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00001-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..1cf1fec734f3d6bfd74a6e38ac7b0f43d24eaaab Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00001-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00002-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00002-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..631ca95c9e17362c498b71979466661ec7ce4be5 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00002-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00003-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00003-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..addfd3424c00e4596de3bfa77751c1fd891164ba Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00003-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00004-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00004-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..32818ec981b1b648ea605e351012c4e58a075454 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00004-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00005-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00005-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..5f4e651519673b3b61726b5a3b0d21a8c962deb5 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00005-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00006-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00006-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..042a71fd169745357111f2f0de84f42e52849b2a Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00006-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00007-of-00008 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00007-of-00008 new file mode 100644 index 0000000000000000000000000000000000000000..a6d9ce627d88ec39344fbd6aae7badd629c5e54c Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/train-00007-of-00008 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00000-of-00002 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00000-of-00002 new file mode 100644 index 0000000000000000000000000000000000000000..1c7757759bad5f59007b429adb520fdb5eed4068 Binary files /dev/null and 
b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00000-of-00002 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00001-of-00002 b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00001-of-00002 new file mode 100644 index 0000000000000000000000000000000000000000..e0f379cab43b5fa46f6f232e93c1deba2548f7a1 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/fake_tf_record_data/validation-00001-of-00002 differ diff --git a/cv/classification/resnet50/tensorflow/test_data/images/black_image.jpg b/cv/classification/resnet50/tensorflow/test_data/images/black_image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..312873be3bd305bfb5962896ea8ae507ca44b572 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/images/black_image.jpg differ diff --git a/cv/classification/resnet50/tensorflow/test_data/images/white_image.jpg b/cv/classification/resnet50/tensorflow/test_data/images/white_image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ad96f25af79ca0d683642c3dbef1049cc7061f84 Binary files /dev/null and b/cv/classification/resnet50/tensorflow/test_data/images/white_image.jpg differ diff --git a/cv/classification/resnet50/tensorflow/test_data/tfrecord_image_generator.py b/cv/classification/resnet50/tensorflow/test_data/tfrecord_image_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..8f0b9102134456fefd7b712c9e1d734c13a0b9e2 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/test_data/tfrecord_image_generator.py @@ -0,0 +1,226 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generate black and white test TFRecords with Example protos. + +Each record within the TFRecord file is a +serialized Example proto. The Example proto contains the following fields: + + image/encoded: string containing JPEG encoded image in RGB colorspace + image/height: integer, image height in pixels + image/width: integer, image width in pixels + image/colorspace: string, specifying the colorspace, always 'RGB' + image/channels: integer, specifying the number of channels, always 3 + image/format: string, specifying the format, always'JPEG' + + image/filename: string containing the basename of the image file + e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG' + image/class/label: integer specifying the index in a classification layer. + The label ranges from [1, 1000] where 0 is not used. + image/class/synset: string specifying the unique ID of the label, + e.g. 'n01440764' + image/class/text: string specifying the human-readable version of the label + e.g. 
'red fox, Vulpes vulpes' + + image/object/bbox/xmin: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/xmax: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/ymin: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/ymax: list of integers specifying the 0+ human annotated + bounding boxes + image/object/bbox/label: integer specifying the index in a classification + layer. The label ranges from [1, 1000] where 0 is not used. Note this is + always identical to the image label. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random + +import numpy as np +import six +import tensorflow.compat.v1 as tf + + +def _int64_feature(value): + """Wrapper for inserting int64 features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + +def _float_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + + +def _bytes_feature(value): + """Wrapper for inserting bytes features into Example proto.""" + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + +def _convert_to_example(filename, image_buffer, label, synset, human, bbox, + height, width): + """Build an Example proto for an example. + + Args: + filename: string, path to an image file, e.g., '/path/to/example.JPG' + image_buffer: bytes, JPEG encoding of RGB image + label: integer, identifier for the ground truth for the network + synset: string, unique WordNet ID specifying the label, e.g., 'n02323233' + human: string, human-readable label, e.g., 'red fox, Vulpes vulpes' + bbox: list of bounding boxes; each box is a list of integers + specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to + the same label as the image label. 
+ height: integer, image height in pixels + width: integer, image width in pixels + Returns: + Example proto + """ + xmin = [] + ymin = [] + xmax = [] + ymax = [] + for b in bbox: + assert len(b) == 4 + # pylint: disable=expression-not-assigned + [l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)] + # pylint: enable=expression-not-assigned + + colorspace = b'RGB' + channels = 3 + image_format = b'JPEG' + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': _int64_feature(height), + 'image/width': _int64_feature(width), + 'image/colorspace': _bytes_feature(colorspace), + 'image/channels': _int64_feature(channels), + 'image/class/label': _int64_feature(label), + 'image/class/synset': _bytes_feature(six.ensure_binary(synset)), + 'image/class/text': _bytes_feature(six.ensure_binary(human)), + 'image/object/bbox/xmin': _float_feature(xmin), + 'image/object/bbox/xmax': _float_feature(xmax), + 'image/object/bbox/ymin': _float_feature(ymin), + 'image/object/bbox/ymax': _float_feature(ymax), + 'image/object/bbox/label': _int64_feature([label] * len(xmin)), + 'image/format': _bytes_feature(image_format), + 'image/filename': _bytes_feature(os.path.basename(six.ensure_binary( + filename))), + 'image/encoded': _bytes_feature(image_buffer)})) + return example + + +class ImageCoder(object): + """Helper class that provides TensorFlow image coding utilities.""" + + def __init__(self): + # Create a single Session to run all image coding calls. + self._sess = tf.Session() + + # Initializes function that converts PNG to JPEG data. + self._image = tf.placeholder(dtype=tf.uint8) + self._encode_jpeg = tf.image.encode_jpeg( + self._image, format='rgb', quality=100) + + def encode_jpeg(self, image): + jpeg_image = self._sess.run(self._encode_jpeg, + feed_dict={self._image: image}) + return jpeg_image + + +def _process_image(coder, name): + """Process a single image file. + + If name is "train", a black image is returned. Otherwise, a white image is + returned. + + Args: + coder: instance of ImageCoder to provide TensorFlow image coding utils. + name: string, unique identifier specifying the data set. + Returns: + image_buffer: bytes, JPEG encoding of RGB image. + height: integer, image height in pixels. + width: integer, image width in pixels. + """ + # Read the image file. + value = 0 if name == 'train' else 255 + height = random.randint(30, 299) + width = random.randint(30, 299) + image = np.full((height, width, 3), value, np.uint8) + + jpeg_data = coder.encode_jpeg(image) + + return jpeg_data, height, width + + +def _process_dataset(output_directory, num_classes, coder, name, num_images, + num_shards): + """Process a complete data set and save it as a TFRecord. + + Args: + output_directory: Where to put outputs. + num_classes: number of classes. + coder: Instance of an ImageCoder. + name: string, unique identifier specifying the data set. + num_images: number of images to generate. + num_shards: integer number of shards to create. 
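+
+  Shard files are named '<name>-<shard>-of-<num_shards>', for example
+  'train-00000-of-00008'.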
+ """ + files_per_shard = num_images // num_shards + for shard in range(num_shards): + output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) + output_file = os.path.join(output_directory, output_filename) + with tf.python_io.TFRecordWriter(output_file) as writer: + for i in range(files_per_shard): + index = shard * files_per_shard + i + image_buffer, height, width = _process_image(coder, name) + + filename = '{}_{}_{}'.format(name, shard, i) + label = index % num_classes + synset = str(index) + human = name + bbox = [[0.1, 0.1, 0.9, 0.9]] + example = _convert_to_example(filename, image_buffer, label, + synset, human, bbox, + height, width) + writer.write(example.SerializeToString()) + + +def write_black_and_white_tfrecord_data( + output_directory, num_classes, num_train_images=512, + num_validation_images=128, train_shards=8, validation_shards=2): + """Writes black and white images in tfrecord format. + + Training images are black and validation images are white. + + Args: + output_directory: Where to put outputs. + num_classes: number of classes. + num_train_images: number of training images to generate. + num_validation_images: number of validation images to generate. + train_shards: integer number of training shards to create. + validation_shards: integer number of validation shards to create. + """ + + coder = ImageCoder() + _process_dataset(output_directory, num_classes, coder, 'validation', + num_validation_images, validation_shards) + _process_dataset(output_directory, num_classes, coder, 'train', + num_train_images, train_shards) diff --git a/cv/classification/resnet50/tensorflow/test_util.py b/cv/classification/resnet50/tensorflow/test_util.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb930a6b1e2fba3285dc2e14cfd0a3fba85ce4b --- /dev/null +++ b/cv/classification/resnet50/tensorflow/test_util.py @@ -0,0 +1,532 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Shared functionality across multiple test files.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from collections import namedtuple +from contextlib import contextmanager +import os + +import numpy as np +import tensorflow.compat.v1 as tf +import benchmark_cnn +import cnn_util +import datasets +import preprocessing +from models import model +from platforms import util as platforms_util +from test_data import tfrecord_image_generator +from tensorflow.core.protobuf import rewriter_config_pb2 # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.platform import test + + +@contextmanager +def monkey_patch(obj, **kwargs): + """Context mgr to monkey patch attributes on an object (such as a module). + + The attributes are patched back to their original value when the context + manager exits. 
+ + For example, to replace benchmark_cnn.get_data_type with an identity function, + do: + + ``` + with monkey_patch(benchmark_cnn, get_data_type=lambda x: x) + loss1 = benchmark_cnn.loss_function(1) # loss1 will be 1 + loss2 = benchmark_cnn.loss_function(params) # Call the original function + ``` + + Args: + obj: The object (which can be a module) to monkey patch attributes on. + **kwargs: Dictionary mapping from attribute name to value that the attribute + will be patched with. + Yields: + Nothing. + """ + old_values = {key: getattr(obj, key) for key in kwargs} + try: + for key, value in kwargs.items(): + setattr(obj, key, value) + yield + finally: + for key, value in old_values.items(): + setattr(obj, key, value) + + +def monkey_patch_base_cluster_manager(): + """Monkey patches get_cluster_manager to return a BaseClusterManager. + + This function replaces platforms_util.get_cluster_manager with a function that + always return a BaseClusterManager. + + This is useful for testing creating a graph in distributed mode, with only a + single process. GrpcClusterManager's constructor blocks until a cluster is set + up, which requires multiple processes to be created. + """ + def get_test_cluster_manager(params, config_proto): + del config_proto + return cnn_util.BaseClusterManager(params) + platforms_util.get_cluster_manager = get_test_cluster_manager + + +def print_and_add_to_list(print_list): + """Returns a function which prints the input, then adds it to print_list.""" + def f(string): + print(string) + print_list.append(string) + return f + + +TrainingOutput = namedtuple('TrainingOutput', + ['loss', 'top_1_accuracy', 'top_5_accuracy']) + + +EvalOutput = namedtuple('EvalOutput', ['top_1_accuracy', 'top_5_accuracy']) + + +def get_training_outputs_from_logs(logs, print_training_accuracy): + """Returns a list of TrainingOutputs by parsing the logs of a training run. + + Args: + logs: A list of strings, each which is a line from the standard output of + tf_cnn_benchmarks from training. Only lines in the form: + 10 images/sec: 14.2 +/- 0.0 (jitter = 0.0) 7.020 + are parsed (the line may also contain the training accuracies). + print_training_accuracy: The value of the param print_training_accuracy. + Returns: + A list of TrainingOutputs. The list has one element per element of logs + that is in the format above. top_1_accuracy and top_5_accuracy are set to -1 + if the line does not contain accuracies. + """ + outputs = [] + for log in logs: + if 'images/sec' in log and '+/-' in log: + parts = log.split() + if print_training_accuracy: + # Example log with training accuracy: + # 10 images/sec: 0.2 +/- 0.0 (jitter = 0.0) 6.908 0.500 1.000 + assert len(parts) == 11 + top_1_acc = float(parts[9]) + top_5_acc = float(parts[10]) + else: + # Example log without training accuracy: + # 10 images/sec: 0.2 +/- 0.0 (jitter = 0.0) 6.908 + assert len(parts) == 9 + top_1_acc = -1 + top_5_acc = -1 + loss = float(parts[8]) + outputs.append(TrainingOutput(loss=loss, top_1_accuracy=top_1_acc, + top_5_accuracy=top_5_acc)) + assert len(outputs) >= 1 + return outputs + + +def get_evaluation_outputs_from_logs(logs): + """Returns the top 1 and 5 accuracies by parsing the logs of an eval run. + + Args: + logs: A list of strings, each which is a line from the standard output of + tf_cnn_benchmarks from evaluation. Only lines in the form: + Accuracy @ 1 = 0.5000 Accuracy @ 5 = 1.0000 [80 examples] + is parsed. + Returns: + A list of EvalOutputs. 
Normally this list only has one EvalOutput, but can + contain multiple if training is done and + --eval_during_training_every_n_steps is specified. + """ + eval_outputs = [] + for log in logs: + if 'Accuracy @ ' in log: + # Example log: + # Accuracy @ 1 = 0.5000 Accuracy @ 5 = 1.0000 [80 examples] + parts = log.split() + assert len(parts) == 12 + top_1_accuracy = float(parts[4]) + top_5_accuracy = float(parts[9]) + eval_outputs.append(EvalOutput(top_1_accuracy, top_5_accuracy)) + assert eval_outputs + return eval_outputs + + +def check_training_outputs_are_reasonable(testcase, training_outputs, + print_training_accuracy, + max_final_loss=10., + previous_final_loss=None): + """Checks the outputs from training a model are reasonable. + + An assert is failed if the outputs are not reasonable. The final top-1 and + top-5 accuracies are asserted to be 1, and so the dataset used to train should + be trivial to learn. For example, the dataset could consist of a black image + with label 0 and a white image with label 1. + + Args: + testcase: A tf.test.TestCase used for assertions. + training_outputs: A list of TrainingOutputs, as returned from + get_training_outputs_from_logs(). + print_training_accuracy: Whether training accuracies were printed and stored + in training_outputs. + max_final_loss: The loss of the final training output is asserted to be at + most this value. + previous_final_loss: If training was resumed from a checkpoint, the loss of + the final step from the previous training run that saved the checkpoint. + """ + if previous_final_loss is not None: + # Ensure the loss hasn't raised significantly from the final loss of the + # previous training run. + testcase.assertLessEqual(training_outputs[0].loss, + previous_final_loss * 1.01) + for output in training_outputs: + testcase.assertLessEqual(output.loss, 100.) + last_output = training_outputs[-1] + if print_training_accuracy: + testcase.assertEqual(last_output.top_1_accuracy, 1.0) + testcase.assertEqual(last_output.top_5_accuracy, 1.0) + if max_final_loss is not None: + testcase.assertLessEqual(last_output.loss, max_final_loss) + + +def train_and_eval(testcase, + run_fn, + params, + check_output_values, + max_final_loss=10., + skip=None): + """Trains a model then evaluates it. + + This function should be used to verify training and evaluating + BenchmarkCNN works without crashing and that it outputs reasonable + values. BenchmarkCNN will be run three times. First, it will train a + model from scratch, saving a checkpoint. Second, it will load the checkpoint + to continue training. Finally, it evaluates based on the loaded checkpoint. + + Args: + testcase: A tf.test.TestCase used for assertions. + run_fn: Must run `BenchmarkCNN` exactly once. BenchmarkCNN is + never used directly, but instead is only run through `run_fn`. `run_fn` + has the signature (run_type, inner_params) -> output_list, where: + * run_type is a string indicating how BenchmarkCNN will be run. + Either 'InitialTraining', 'TrainingFromCheckpoint' or 'Evaluation'. + * inner_params is the params BenchmarkCNN should be run with. + * output_list[i] is a list of lines from the ith worker's stdout. + params: The params BenchmarkCNN will be run with. + Will be passed to `run_fn` slightly modified in order to run with both + training and evaluation. + check_output_values: Whether the outputs of the workers, such as training + accuracy, should be checked to make sure their values are reasonable. + Fails an assert on `testcase` if a check fails. 
+ max_final_loss: The loss of the final training output is asserted to be at + most this value for both training runs. + skip: If 'eval', evaluation is not done. if + 'eval_and_train_from_checkpoint', evaluation and training from a + checkpoint are both not done. + """ + + assert not skip or skip in {'eval', 'eval_and_train_from_checkpoint'} + + # Part 1: Train from scratch. + tf.logging.info('Training model from scratch') + print_training_accuracy = (params.print_training_accuracy or + params.forward_only) + initial_train_logs = run_fn('InitialTraining', params) + testcase.assertGreaterEqual(len(initial_train_logs), 1) + for lines in initial_train_logs: + initial_train_outputs = get_training_outputs_from_logs( + lines, print_training_accuracy) + if params.cross_replica_sync and params.batch_group_size == 1: + testcase.assertEqual(len(initial_train_outputs), params.num_batches) + if check_output_values: + check_training_outputs_are_reasonable(testcase, initial_train_outputs, + print_training_accuracy, + max_final_loss=max_final_loss) + if params.train_dir is not None: + train_dir_entries = set(os.listdir(params.train_dir)) + testcase.assertGreater(len(train_dir_entries), 0) + else: + train_dir_entries = None + + if skip == 'eval_and_train_from_checkpoint': + return + + # Part 2: Train from the loaded checkpoint. + testcase.assertIsNotNone(train_dir_entries) + tf.logging.info('Training model from loaded checkpoint') + # Run for same number of batches as before. + params = params._replace(num_batches=params.num_batches * 2) + train_logs_from_ckpt = run_fn('TrainingFromCheckpoint', params) + testcase.assertGreaterEqual(len(train_logs_from_ckpt), 1) + for lines in train_logs_from_ckpt: + train_outputs_from_ckpt = get_training_outputs_from_logs( + lines, print_training_accuracy) + if params.cross_replica_sync and params.batch_group_size == 1: + testcase.assertEqual(len(train_outputs_from_ckpt), + params.num_batches // 2 - params.num_warmup_batches) + if check_output_values: + check_training_outputs_are_reasonable( + testcase, train_outputs_from_ckpt, print_training_accuracy, + max_final_loss=max_final_loss, + previous_final_loss=initial_train_outputs[-1].loss) + # Ensure a new checkpoint was written out. + testcase.assertNotEqual(train_dir_entries, set(os.listdir(params.train_dir))) + + if skip == 'eval': + return + + # Part 3: Evaluate from the loaded checkpoint. 
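+  # Evaluation below restores num_batches to its original value (it was doubled
+  # for the training-from-checkpoint run) and sets eval=True before running
+  # BenchmarkCNN a final time on the saved checkpoint.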
+ tf.logging.info('Evaluating model from checkpoint') + params = params._replace(num_batches=params.num_batches // 2, eval=True) + eval_logs = run_fn('Evaluation', params) + testcase.assertGreaterEqual(len(eval_logs), 1) + for lines in eval_logs: + eval_outputs = get_evaluation_outputs_from_logs(lines) + assert len(eval_outputs) == 1 + top_1_accuracy, top_5_accuracy = eval_outputs[0] + if check_output_values: + testcase.assertEqual(top_1_accuracy, 1.0) + testcase.assertEqual(top_5_accuracy, 1.0) + + +def get_temp_dir(dir_name): + dir_path = os.path.join(test.get_temp_dir(), dir_name) + os.mkdir(dir_path) + return dir_path + + +def create_black_and_white_images(): + dir_path = get_temp_dir('black_and_white_images') + tfrecord_image_generator.write_black_and_white_tfrecord_data(dir_path, + num_classes=1) + return dir_path + + +def get_params(train_dir_name): + """Returns params that can be used to train.""" + params = benchmark_cnn.make_params( + batch_size=2, + display_every=1, + init_learning_rate=0.005, + model='trivial', + num_batches=20, + num_gpus=2, + num_warmup_batches=5, + optimizer='sgd', + print_training_accuracy=True, + train_dir=get_temp_dir(train_dir_name), + variable_update='parameter_server', + weight_decay=0, + distortions=True, + distort_color_in_yiq=False) + return benchmark_cnn.set_default_param_values_and_env_vars(params) + + +def get_var_update_params(): + """Returns params that are used when testing variable updates.""" + params = benchmark_cnn.make_params( + batch_size=2, + model='test_model', + num_gpus=2, + display_every=1, + num_warmup_batches=0, + num_batches=4, + weight_decay=2 ** -4, + init_learning_rate=2 ** -4, + optimizer='sgd') + return benchmark_cnn.set_default_param_values_and_env_vars(params) + + +def get_fake_var_update_inputs(): + """Returns fake input 1x1 images to use in variable update tests.""" + # BenchmarkCNN divides by 127.5 then subtracts 1.0 from the images, so after + # that, the images will be -1., 0., 1., ..., 14. + return np.resize(127.5 * np.array(range(16)), (16, 1, 1, 1)) + + +def _worker_batches_in_numpy_array(numpy_inputs, batch_size, shift_ratio): + """Yields batches from a numpy array, for a single worker.""" + numpy_inputs = cnn_util.roll_numpy_batches(numpy_inputs, batch_size, + shift_ratio) + i = 0 + total_batches = numpy_inputs.shape[0] + assert total_batches % batch_size == 0 + while True: + yield numpy_inputs[i:i + batch_size, ...] + i = (i + batch_size) % total_batches + + +def manually_compute_losses(numpy_inputs, inputs_placeholder, loss, num_workers, + params): + """Manually compute the losses each worker should report in tf_cnn_benchmarks. + + This function essentially simulates tf_cnn_benchmarks, computing what the loss + of each worker should be. The caller should create a model, that takes in + images from `inputs_placeholder`, a tf.placeholder, and computes `loss`. + + This function, and all ops passed to this function, must be run under a + tf.device('cpu:0') context manager. + + Non-SGD optimizers are not supported with multiple workers. + + Args: + numpy_inputs: A Numpy array to use as the input images. + inputs_placeholder: A tf.placeholder tensor, where input images can be fed + into. + loss: A scalar tensor representing the loss of the model, which is obtained + from the input images in inputs_placeholder. + num_workers: How many workers should be simulated. + params: Params tuple. 
This doesn't have to have information about the + distributed cluster, such as --num_workers, as num_workers is passed in + separately. + + Returns: + A list of list of losses. return_value[i][j] is the loss of the ith worker + after the jth step. + """ + batch_size = params.batch_size * params.num_gpus + assert numpy_inputs.shape[0] % (num_workers * batch_size) == 0 + l2_loss = tf.add_n([tf.nn.l2_loss(x) for x in tf.trainable_variables()]) + total_loss = loss + params.weight_decay * l2_loss + reported_loss = (loss if params.loss_type_to_report == 'base_loss' + else total_loss) + gradient_multiplier = 1 + if params.variable_update in ('replicated', 'distributed_all_reduce'): + # In certain variable updates, tf_cnn_benchmarks add the gradients of the + # GPUs instead of taking their mean, making the gradients effectively + # params.num_gpu times higher. + # TODO(b/62722498): Make all variable updates consistent. + gradient_multiplier = params.num_gpus + + opt = benchmark_cnn.get_optimizer(params, params.init_learning_rate) + grad_vars = opt.compute_gradients( + total_loss, grad_loss=tf.constant(gradient_multiplier, dtype=tf.float32)) + grads = [g for g, _ in grad_vars] + # We apply gradients from a placeholder. That way, we can first compute the + # gradients from each worker, then afterwards apply them one by one by feeding + # them into the placeholder. + placeholder_grad_vars = [(tf.placeholder(g.dtype, g.shape), v) + for g, v in grad_vars] + placeholder_grads = [g for g, _ in placeholder_grad_vars] + apply_grads_op = opt.apply_gradients(placeholder_grad_vars) + + batch_iterators = [_worker_batches_in_numpy_array(numpy_inputs, batch_size, + shift_ratio=i / num_workers) + for i in range(num_workers)] + # Set the GPU count to 0, to avoid taking all the GPU memory. Unfortunately, + # doing so still takes up about ~1GB for some reason. + config = tf.ConfigProto(device_count={'GPU': 0}) + config.graph_options.rewrite_options.pin_to_host_optimization = ( + rewriter_config_pb2.RewriterConfig.OFF) + with tf.Session(config=config) as sess: + sess.run(tf.global_variables_initializer()) + losses = [[] for _ in range(num_workers)] + for i in range(params.num_batches): + computed_grads = [] + for j in range(num_workers): + batch_feed = next(batch_iterators[j]) + batch_feed = batch_feed / 127.5 - 1 + worker_loss, worker_grads = sess.run((reported_loss, grads), + {inputs_placeholder: batch_feed}) + losses[j].append(worker_loss) + computed_grads.append(worker_grads) + for worker_grads in computed_grads: + # TODO(reedwm): With multiple workers, applying the gradients + # sequentially per worker is not equivalent to what tf_cnn_benchmarks + # does when the optmizer is not SGD. Therefore, this currently does not + # work currently when num_workers > 1 and params.optimizer != 'sgd'. + feed_dict = dict(zip(placeholder_grads, worker_grads)) + sess.run(apply_grads_op, feed_dict) + return losses + + +class TestCNNModel(model.CNNModel): + """A simple model used for testing. + + The input is a 1-channel 1x1 image, consisting of a single number. The model + has two scalar variables: A and B, initialized to 1 and 2 respectively. Given + an image x, the loss is defined as: + + loss = x * A * B + """ + + def __init__(self): + super(TestCNNModel, self).__init__( + 'test_cnn_model', image_size=1, batch_size=1, learning_rate=1) + self.depth = 1 + + VAR_A_INITIAL_VALUE = 1. + VAR_B_INITIAL_VALUE = 2. 
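+
+  # With loss = x * A * B and the initial values A=1 and B=2, an input image
+  # holding the value 3 produces a loss of 6 before any gradient updates.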
+ + def add_inference(self, cnn): + # This model only supports 1x1 images with 1 channel + assert cnn.top_layer.shape[1:] == (1, 1, 1) + # Multiply by variable A. + with tf.name_scope('mult_by_var_A'): + cnn.conv(1, 1, 1, 1, 1, use_batch_norm=None, activation=None, bias=None, + kernel_initializer=tf.constant_initializer( + self.VAR_A_INITIAL_VALUE)) + # Multiply by variable B. + with tf.name_scope('mult_by_var_B'): + cnn.conv(1, 1, 1, 1, 1, use_batch_norm=None, activation=None, bias=None, + kernel_initializer=tf.constant_initializer( + self.VAR_B_INITIAL_VALUE)) + with tf.name_scope('reshape_to_scalar'): + cnn.reshape([-1, 1]) + + def skip_final_affine_layer(self): + return True + + def loss_function(self, inputs, build_network_result): + del inputs + return tf.reduce_mean(build_network_result.logits) + + def manually_compute_losses(self, inputs, num_workers, params): + with tf.Graph().as_default(), tf.device('/cpu:0'): + a = tf.Variable(self.VAR_A_INITIAL_VALUE, name='A') + b = tf.Variable(self.VAR_B_INITIAL_VALUE, name='B') + inputs_placeholder = tf.placeholder(tf.float32, + (None, 1, 1, 1), + name='inputs_placeholder') + inputs_reshaped = tf.reshape(inputs_placeholder, (-1, 1)) + loss = self.loss_function( + None, + model.BuildNetworkResult(logits=inputs_reshaped * a * b, + extra_info=None)) + return manually_compute_losses(inputs, inputs_placeholder, loss, + num_workers, params) + + def accuracy_function(self, inputs, logits): + del inputs + # Let the accuracy be the same as the loss function. + return {'top_1_accuracy': logits, 'top_5_accuracy': logits} + + +class TestDataSet(datasets.ImageDataset): + """A Dataset consisting of 1x1 images with a depth of 1.""" + + def __init__(self, height=1, width=1, depth=1): + super(TestDataSet, self).__init__('test_dataset', height=height, + width=width, depth=depth, data_dir=None, + queue_runner_required=True, num_classes=1) + + def num_examples_per_epoch(self, subset='train'): + del subset + return 1 + + def get_input_preprocessor(self, input_preprocessor='default'): + return preprocessing.TestImagePreprocessor + + def use_synthetic_gpu_inputs(self): + return False diff --git a/cv/classification/resnet50/tensorflow/tf_cnn_benchmarks.py b/cv/classification/resnet50/tensorflow/tf_cnn_benchmarks.py new file mode 100644 index 0000000000000000000000000000000000000000..3014ed7a15a9776572be49a7f5cb5b794504914f --- /dev/null +++ b/cv/classification/resnet50/tensorflow/tf_cnn_benchmarks.py @@ -0,0 +1,80 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Benchmark script for TensorFlow. + +See the README for more information. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf +import time + +import benchmark_cnn +import cnn_util +import flags +import mlperf +from cnn_util import log_fn + + +flags.define_flags() +for name in flags.param_specs.keys(): + absl_flags.declare_key_flag(name) + +absl_flags.DEFINE_boolean( + 'ml_perf_compliance_logging', False, + 'Print logs required to be compliant with MLPerf. If set, must clone the ' + 'MLPerf training repo https://github.com/mlperf/training and add ' + 'https://github.com/mlperf/training/tree/master/compliance to the ' + 'PYTHONPATH') + + +def main(positional_arguments): + # Command-line arguments like '--distortions False' are equivalent to + # '--distortions=True False', where False is a positional argument. To prevent + # this from silently running with distortions, we do not allow positional + # arguments. + assert len(positional_arguments) >= 1 + if len(positional_arguments) > 1: + raise ValueError('Received unknown positional arguments: %s' + % positional_arguments[1:]) + + params = benchmark_cnn.make_params_from_flags() + try: + from dltest import show_training_arguments + show_training_arguments(flags.FLAGS) + except: + pass + with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging, + params.model): + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + + tfversion = cnn_util.tensorflow_version_tuple() + log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) + + bench.print_info() + bench.run() + + +if __name__ == '__main__': + time.sleep(5) + tf.disable_v2_behavior() + app.run(main) # Raises error on invalid flags, unlike tf.app.run() diff --git a/cv/classification/resnet50/tensorflow/variable_mgr.py b/cv/classification/resnet50/tensorflow/variable_mgr.py new file mode 100644 index 0000000000000000000000000000000000000000..119b0278c0c0a8ac0f49811267554b3db216ef98 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/variable_mgr.py @@ -0,0 +1,839 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines VariableMgr and subclasses used to manage variables. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import re + +import tensorflow.compat.v1 as tf + +import allreduce +import batch_allreduce +import variable_mgr_util + + +class VariableMgr(object): + """Abstract superclass for class used by BenchmarkCNN to control variables. + + Functions on this class are used to control how variables are created and + managed, and how gradients are computed and applied. 
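+
+  A typical gradient path through a VariableMgr subclass (names here are
+  illustrative, not part of the API) is:
+
+    devices, state = mgr.preprocess_device_grads(device_grads)
+    for i, _ in enumerate(devices):
+      grads = mgr.get_gradients_to_apply(i, state)
+      mgr.append_apply_gradients_ops(state, opt, grads, training_ops,
+                                     loss_scale_params)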
+ """ + + def __init__(self, benchmark_cnn): + self.benchmark_cnn = benchmark_cnn + self.staging_delta_ops = [] + self.use_resource_vars = benchmark_cnn.params.use_resource_vars + + # A variable for automatic loss scaling. + self.grad_has_inf_nan = None + + self._reuse_vars = False + + def each_tower_has_variables(self): + """Returns True if each GPU tower of the model has separate variables.""" + assert False, 'Must be implemented in subclass' + + def supports_staged_vars(self): + """Whether staged variable management is supported.""" + return False + + def create_outer_variable_scope(self, device_num): + """Create the tf.variable_scope around all model graph operations.""" + del device_num # unused by this implementation + assert False, 'Must be implemented in subclass' + + def preprocess_device_grads(self, device_grads): + """Preprocess the device gradients prior to applying them. + + Args: + device_grads: List of lists of (gradient, variable) tuples. + device_grads[t][g] = (gradient, variable), where t is the index of the + tower and g is the index of the gradient-variable pair. + + Returns: a tuple of (apply_gradients_devices, gradient_state). + gradient_state is an opaque structure that should be passed to + get_gradients_to_apply() and append_apply_gradients_ops() (in that order). + apply_gradients_devices is a list of devices where the gradients will be + applied with get_gradients_to_apply() and append_apply_gradients_ops(). + """ + del device_grads # unused by this implementation + assert False, 'Must be implemented in subclass' + + def get_gradients_to_apply(self, device_num, gradient_state): + """Returns the [(gradient, variable)] list to apply for device_num. + + Args: + device_num: indexes into apply_gradients_devices, which was returned by an + earlier call to preprocess_device_grads. + gradient_state: from previous call to apply_gradients_devices. + """ + del device_num, gradient_state # unused by this implementation + assert False, 'Must be implemented in subclass' + + def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops, + loss_scale_params): + """Adds training ops for grads to 'training_ops'. + + + + Args: + gradient_state: from previous call to apply_gradients_devices. + opt: the underlying optimizer + grads: [(grad, var)] to apply + training_ops: list to which to add ops + loss_scale_params: parameters for loss scaling. + """ + del gradient_state # unused by this implementation + + def get_apply_gradients_ops_func(): + """Returns the apply_gradients op.""" + return [opt.apply_gradients(grads)] + + variable_mgr_util.append_gradients_with_loss_scale( + training_ops, get_apply_gradients_ops_func, loss_scale_params, + self.grad_has_inf_nan) + + def get_post_init_ops(self): + """Returns ops that should run post-initialization.""" + return [] + + def get_devices(self): + """Returns devices to use for computation; includes replica selection.""" + assert False, 'Must be implemented in subclass' + + def savable_variables(self): + """Returns a list/dict of savable variables to pass to tf.train.Saver.""" + return tf.global_variables() + + def trainable_variables_on_device(self, + rel_device_num, + abs_device_num, + writable=False): + """Return the set of trainable variables on device. + + Args: + rel_device_num: local worker device index. + abs_device_num: global graph device index. + writable: whether to get a reference to the underlying variable. + + Returns: + The set of trainable variables on the specified device. 
+ """ + del rel_device_num, writable + if self.each_tower_has_variables(): + params = [ + v for v in tf.trainable_variables() + if v.name.startswith('v%s/' % abs_device_num) + ] + else: + params = tf.trainable_variables() + return params + + @contextlib.contextmanager + def reuse_variables(self): + """Context manager that causes variables requested to be reused. + + Variables requested under this context manager must already exist, and will + be reused instead of being created again. This should be used if the + evaluation model is being built after the training model has already been + built. This is because the evaluation model should reuse variables from the + training model. + + Yields: + Nothing. + """ + old_reuse_vars = self._reuse_vars + try: + self._reuse_vars = True + yield + finally: + self._reuse_vars = old_reuse_vars + + +class VariableMgrIndependent(VariableMgr): + """VariableMgr that implements the --independent mode for local jobs. + + Each GPU has its own copy of the variables, and gradients are + not shared between towers. This can be used to check + performance when no data is moved between GPUs. + """ + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + return (self.benchmark_cnn.devices, device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + tower_grad = device_grads[device_num] + + if self.benchmark_cnn.enable_auto_loss_scale and device_num == 0: + # Since we don't aggregate variables in --independent mode, we cannot tell + # if there are NaNs on all GPUs. So we arbitrarily choose to only check + # NaNs on the first GPU. + has_inf_nan_list = [] + for grad, _ in tower_grad: + has_inf_nan_list.append(tf.reduce_all(tf.is_finite(grad))) + self.grad_has_inf_nan = tf.logical_not(tf.reduce_all(has_inf_nan_list)) + + return tower_grad + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +class VariableMgrLocalFetchFromPS(VariableMgr): + """VariableMgr that implements the --parameter_server mode for local jobs. + + Variables are stored on a parameter server. For each step, each tower gets + a copy of the variables from the parameter server, and sends its gradients + to the param server. + """ + + def each_tower_has_variables(self): + return False + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope('v', reuse=bool(device_num) or self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + return ([self.benchmark_cnn.param_server_device], device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + assert device_num == 0 + device_grads = gradient_state + agg_grads, self.grad_has_inf_nan = ( + variable_mgr_util. 
+ aggregate_gradients_using_copy_with_variable_colocation( + device_grads, + use_mean=True, + check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)) + return agg_grads + + def get_devices(self): + raw_devices = self.benchmark_cnn.raw_devices + if self.benchmark_cnn.local_parameter_device_flag == 'gpu': + return [ + variable_mgr_util.ParamServerDeviceSetter(d, raw_devices) + for d in raw_devices + ] + else: + return [ + tf.train.replica_device_setter( + worker_device=d, + ps_device=self.benchmark_cnn.param_server_device, + ps_tasks=1) for d in raw_devices + ] + + +class VariableMgrLocalFetchFromStagedPS(VariableMgrLocalFetchFromPS): + """Implements fetching a local variable through staging buffers. + """ + + def __init__(self, benchmark_cnn): + super(VariableMgrLocalFetchFromStagedPS, self).__init__(benchmark_cnn) + # A data structure to track where the variables are used on each device. + # Indexed by device_num and var_name, each entry stores the "put" and "get" + # ops used for that variable on that device: + # staging_vars_on_devices[device_num][var_name] == (put_op, get_op) + self.staging_vars_on_devices = [ + dict() for _ in self.benchmark_cnn.raw_devices + ] + + def supports_staged_vars(self): + return True + + def create_outer_variable_scope(self, device_num): + self._custom_getter = variable_mgr_util.StagedVariableGetter( + device_num, self.benchmark_cnn.raw_devices, None, self) + return tf.variable_scope( + 'v', reuse=bool(device_num) or self._reuse_vars, + custom_getter=self._custom_getter, use_resource=self.use_resource_vars) + + def trainable_variables_on_device(self, + rel_device_num, + abs_device_num, + writable=False): + return self._custom_getter.trainable_variables_on_device( + rel_device_num, abs_device_num, writable=writable) + + +class VariableMgrLocalReplicated(VariableMgr): + """VariableMgr that implements the --replicated mode for local jobs. + + Each GPU has its own copy of the variables. To apply gradients, + either a local all-reduce algorithm is applied or a regular + cross-device aggregation is used to replicate the combined + gradients to all towers. 
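+
+  Per-tower variables live under the scopes 'v0/', 'v1/', ..., and
+  get_post_init_ops() copies the initialized values from tower 0 to the other
+  towers so that every replica starts from identical weights.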
+ """ + + def __init__(self, benchmark_cnn, all_reduce_spec, + agg_small_grads_max_bytes, agg_small_grads_max_group, + allreduce_merge_scope): + super(VariableMgrLocalReplicated, self).__init__(benchmark_cnn) + if all_reduce_spec: + spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + if len(spec) != 1: + raise ValueError( + 'replicated mode does not support hybrid all-reduce strategies') + self._all_reduce_spec = spec[0] + else: + self._all_reduce_spec = None + self._agg_small_grads_max_bytes = agg_small_grads_max_bytes + self._agg_small_grads_max_group = agg_small_grads_max_group + self._warmup_ops = [] + self._allreduce_merge_scope = allreduce_merge_scope + self._gradient_put_ops = None + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + compact_grads = (self.benchmark_cnn.params.use_fp16 and + self.benchmark_cnn.params.compact_gradient_transfer) + defer_grads = (self.benchmark_cnn.params.variable_consistency == 'relaxed') + + grads_to_reduce = [[g for g, _ in grad_vars] for grad_vars in device_grads] + algorithm = batch_allreduce.algorithm_from_params(self.benchmark_cnn.params) + reduced_grads, self._warmup_ops = algorithm.batch_all_reduce( + grads_to_reduce, self.benchmark_cnn.params.gradient_repacking, + compact_grads, defer_grads, self.benchmark_cnn.params.xla_compile) + if self.benchmark_cnn.enable_auto_loss_scale: + # Check for infs or nans + is_finite_list = [] + with tf.name_scope('check_for_inf_and_nan'): + for tower_grads in reduced_grads: + with tf.colocate_with(tower_grads[0]): + # TODO(tanmingxing): Create fused op that takes in a list of tensors + # as input and returns scalar boolean True if there are any + # infs/nans. + is_finite_list.append(tf.reduce_all( + [tf.reduce_all(tf.is_finite(g)) for g in tower_grads])) + self.grad_has_inf_nan = tf.logical_not(tf.reduce_all(is_finite_list)) + reduced_device_grads = [[ + (g, v) for g, (_, v) in zip(grads, grad_vars) + ] for grads, grad_vars in zip(reduced_grads, device_grads)] + return self.benchmark_cnn.devices, reduced_device_grads + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + return device_grads[device_num] + + def get_post_init_ops(self): + # Copy initialized values for variables on GPU 0 to other GPUs. + global_vars = tf.global_variables() + var_by_name = dict([(v.name, v) for v in global_vars]) + post_init_ops = [] + for v in global_vars: + split_name = v.name.split('/') + # TODO(b/62630508): use more specific prefix than v or v0. + if split_name[0] == 'v0' or not v.name.startswith('v'): + continue + split_name[0] = 'v0' + copy_from = var_by_name['/'.join(split_name)] + post_init_ops.append(v.assign(copy_from.read_value())) + post_init_ops += self._warmup_ops + return post_init_ops + + def savable_variables(self): + """Return the set of variables used for saving/loading the model.""" + params = [] + for v in tf.global_variables(): + split_name = v.name.split('/') + if split_name[0] == 'v0' or not v.name.startswith('v'): + params.append(v) + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +class VariableMgrDistributedAllReduce(VariableMgr): + """VariableMgr that implements the --distributed_all_reduce mode. + + Each GPU has its own copy of the variables. 
To apply gradients, + the specified all-reduce algorithm is used to reduce the gradients + and replicate the final value to all GPUs. + """ + + def __init__(self, benchmark_cnn, all_reduce_spec, job_name, + num_workers, agg_small_grads_max_bytes, + agg_small_grads_max_group, allreduce_merge_scope): + super(VariableMgrDistributedAllReduce, self).__init__(benchmark_cnn) + if not all_reduce_spec: + raise ValueError( + 'distributed_all_reduce requires a non-empty all_reduce_spec') + self._all_reduce_spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + self._all_reduce_device_prefixes = ( + allreduce.build_all_reduce_device_prefixes(job_name, num_workers)) + self._num_workers = num_workers + self._agg_small_grads_max_bytes = agg_small_grads_max_bytes + self._agg_small_grads_max_group = agg_small_grads_max_group + self._allreduce_merge_scope = allreduce_merge_scope + if not self._all_reduce_spec: + raise ValueError('all_reduce_spec must be specified') + self._single_session = True + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + """Create a scope for the named device. + + Args: + device_num: index of device for variable scope. (Note that + device_num spans all processes in cluster since a single global + graph is used.) + + Returns: + the requested variable_scope + """ + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars, + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + remaining_grads = device_grads + aggregated_grads = [] + for spec_tuple in self._all_reduce_spec: + if spec_tuple.limit < 0: + this_grads = remaining_grads + remaining_grads = [] + else: + (this_grads, remaining_grads) = allreduce.split_grads_by_size( + spec_tuple.limit, remaining_grads) + if this_grads: + range_agg_grads = allreduce.sum_gradients_all_reduce( + self._single_session, + self._all_reduce_device_prefixes, + this_grads, + self._num_workers, + spec_tuple.alg, + spec_tuple.shards, + self.benchmark_cnn.gpu_indices, + agg_small_grads_max_bytes=self._agg_small_grads_max_bytes, + agg_small_grads_max_group=self._agg_small_grads_max_group, + allreduce_merge_scope=self._allreduce_merge_scope) + if not aggregated_grads: + aggregated_grads = range_agg_grads + else: + assert len(aggregated_grads) == len(range_agg_grads) + for i in range(len(aggregated_grads)): + aggregated_grads[i] += range_agg_grads[i] + assert not remaining_grads + full_device_set = [] + for grads in device_grads: + g, v = grads[0] + del v + full_device_set.append(g.device) + return (full_device_set, aggregated_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + if device_num >= len(device_grads): + raise ValueError('device_num %d exceeds length of device_grads (%d)' % + (device_num, len(device_grads))) + return device_grads[device_num] + + def get_post_init_ops(self): + """Copy initialized values for variables to other devices.""" + global_vars = tf.global_variables() + var_by_name = dict([(v.name, v) for v in global_vars]) + post_init_ops = [] + for v in global_vars: + split_name = v.name.split('/') + # TODO(b/62630508): use more specific prefix than v or v0. 
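+      # Each tower keeps its variables under a 'v<N>' scope; skip the 'v0'
+      # copies (and any non-tower variables) and, for the rest, rewrite the
+      # leading scope to 'v0' to locate the source variable on the first
+      # device to copy from.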
+ if split_name[0] == 'v0' or not v.name.startswith('v'): + continue + split_name[0] = 'v0' + copy_from = var_by_name['/'.join(split_name)] + post_init_ops.append(v.assign(copy_from.read_value())) + return post_init_ops + + def savable_variables(self): + """Return the set of variables used for saving/loading the model.""" + params = [] + for v in tf.global_variables(): + split_name = v.name.split('/') + if split_name[0] == 'v0' or not v.name.startswith('v'): + params.append(v) + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +# TODO(tucker): Merge this mode with DistributedAllReduce. +class VariableMgrCollectiveAllReduce(VariableMgr): + """VariableMgr that implements the --collective_all_reduce mode. + + Each GPU has its own copy of the variables. To apply gradients + the TF native collective all-reduce op is used to reduce the gradients + and replicate the final value to all GPUs. + """ + + def __init__(self, benchmark_cnn, all_reduce_spec, + num_workers, num_gpus, task_id, allreduce_merge_scope): + super(VariableMgrCollectiveAllReduce, self).__init__(benchmark_cnn) + if not all_reduce_spec: + raise ValueError( + 'collective_all_reduce requires a non-empty all_reduce_spec: %s' + % all_reduce_spec) + parsed_spec = allreduce.parse_all_reduce_spec(all_reduce_spec) + # So far we only support a length-1 all_reduce_spec + if len(parsed_spec) > 1 or parsed_spec[0].limit > 0: + raise ValueError( + 'collective_all_reduce requires one single-range all_reduce_spec %s' + % parsed_spec) + self._all_reduce_spec = parsed_spec[0] + if self._all_reduce_spec.alg != 'collective': + raise ValueError( + 'VariableMgrCollectiveAllReduce initialized with non-collective ' + 'all_reduce_spec %s' % self.all_reduce_spec) + self._num_workers = num_workers + self._num_gpus = num_gpus + self._task_id = task_id + self._allreduce_merge_scope = allreduce_merge_scope + self._instance_key_counter = 10000 + self._instance_key_table = dict() + self._single_session = False + # List of prefixes for generating PS devices, unused here. + self._all_reduce_device_prefixes = None + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + """Create a scope for the named device. + + Args: + device_num: index of device for variable scope. 
+ + Returns: + the requested variable_scope + """ + return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars) + + def preprocess_device_grads(self, device_grads): + reduced_grads = allreduce.sum_gradients_all_reduce( + self._single_session, + self._all_reduce_device_prefixes, + device_grads, + self._num_workers, + 'collective', + self._all_reduce_spec.shards, + self.benchmark_cnn.gpu_indices, + allreduce_merge_scope=self._allreduce_merge_scope) + assert len(reduced_grads) == len(device_grads) + full_device_set = [] + for grads in device_grads: + g, _ = grads[0] + full_device_set.append(g.device) + return (full_device_set, reduced_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state + if device_num >= len(device_grads): + raise ValueError('device_num %d exceeds length of device_grads (%d)' % + (device_num, len(device_grads))) + return device_grads[device_num] + + def _get_instance_key(self, name): + if name not in self._instance_key_table.keys(): + self._instance_key_counter += 1 + self._instance_key_table[name] = self._instance_key_counter + return self._instance_key_table[name] + + def get_post_init_ops(self): + """Broadcast initialized values of variables to other devices. + + Returns: + At task 0 device 0, broadcast_send. + At all other devices and tasks, broadcast_recv. + """ + global_vars = tf.global_variables() + group_size = self._num_workers * self._num_gpus + post_init_ops = [] + # Gather variables into same-var-different-device groups. + vars_by_suffix = dict() + for v in global_vars: + split_name = v.name.split('/') + mo = re.match(r'v(\d+)$', split_name[0]) + if mo: + device_id = int(mo.group(1)) + suffix = '/'.join(split_name[1:]) + if suffix in vars_by_suffix.keys(): + vars_by_suffix[suffix].append(v) + else: + vars_by_suffix[suffix] = [v] + # Generate broadcast ops for each such group. + for suffix in sorted(vars_by_suffix): + vlist = vars_by_suffix[suffix] + assert self._num_gpus == len(vlist) + devices = [v.device for v in vlist] + # NOTE: this key should generate the same value for all tasks + group_key = allreduce.collective_group_key(devices) + group_size = self._num_workers * len(devices) + instance_key = self._get_instance_key(suffix) + for v in vlist: + split_name = v.name.split('/') + mo = re.match(r'v(\d+)$', split_name[0]) + if mo: + device_id = int(mo.group(1)) + if (self._task_id == 0 and device_id == 0): + with tf.device(v.device): + bcast_send = allreduce.broadcast_send( + v, v.shape, v.dtype, group_size, group_key, instance_key) + post_init_ops.append(v.assign(bcast_send)) + else: + with tf.device(v.device): + bcast_recv = allreduce.broadcast_recv( + v.shape, v.dtype, group_size, group_key, instance_key) + post_init_ops.append(v.assign(bcast_recv)) + return post_init_ops + + def savable_variables(self): + """Return the set of variables used for saving/loading the model.""" + params = [] + if self._task_id == 0: + for v in tf.global_variables(): + split_name = v.name.split('/') + if split_name[0] == 'v0' or not v.name.startswith('v'): + params.append(v) + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices + + +class VariableMgrDistributedFetchFromPS(VariableMgr): + """Implements --variable_update=parameter_server mode for distributed jobs. + + Variables are stored on a parameter server. For each step, each tower gets + a copy of the variables from the parameter server, and sends its gradients + to the param server. 
+ """ + + def each_tower_has_variables(self): + return False + + def create_outer_variable_scope(self, device_num): + if self.benchmark_cnn.local_parameter_device_flag == 'gpu': + caching_devices = self.benchmark_cnn.raw_devices + else: + caching_devices = [self.benchmark_cnn.cpu_device] + custom_getter = variable_mgr_util.OverrideCachingDevice( + caching_devices, self.benchmark_cnn.cpu_device, 1024 * 64) + return tf.variable_scope( + 'v', reuse=bool(device_num) or self._reuse_vars, + custom_getter=custom_getter, use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + # Returns (gradient_devices, gradient_state) + return ([self.benchmark_cnn.param_server_device], device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + assert device_num == 0 + agg_grads, self.grad_has_inf_nan = ( + variable_mgr_util.aggregate_gradients_using_copy( + gradient_state, + use_mean=True, + check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)) + return agg_grads + + def get_devices(self): + ps_strategy = variable_mgr_util.GreedyLoadBalancingStrategy( + self.benchmark_cnn.num_ps, variable_mgr_util.byte_size_load_fn) + return [ + tf.train.replica_device_setter( + worker_device=d, + cluster=self.benchmark_cnn.cluster_manager.get_cluster_spec(), + ps_strategy=ps_strategy) for d in self.benchmark_cnn.raw_devices + ] + + +class VariableMgrDistributedFetchFromStagedPS( + VariableMgrDistributedFetchFromPS): + """Extends VariableMgrDistributedFetchFromPS for --staged_vars.""" + + def __init__(self, benchmark_cnn): + super(VariableMgrDistributedFetchFromStagedPS, self).__init__(benchmark_cnn) + self.staging_vars_on_devices = [ + dict() for _ in self.benchmark_cnn.raw_devices + ] + self.staged_vars_on_cpu = {} + + def create_outer_variable_scope(self, device_num): + self._custom_getter = variable_mgr_util.StagedVariableGetter( + device_num, self.benchmark_cnn.raw_devices, + self.benchmark_cnn.cpu_device, self) + return tf.variable_scope( + 'v', reuse=bool(device_num) or self._reuse_vars, + custom_getter=self._custom_getter, use_resource=self.use_resource_vars) + + def supports_staged_vars(self): + return True + + def trainable_variables_on_device(self, + rel_device_num, + abs_device_num, + writable=False): + return self._custom_getter.trainable_variables_on_device( + rel_device_num, abs_device_num, writable=writable) + + +class VariableMgrDistributedReplicated(VariableMgr): + """VariableMgr that implements the --distributed_replicated mode. + + Each GPU has a copy of the variables, and updates its copy after the + parameter servers are all updated with the gradients from all servers. Only + works with cross_replica_sync=true. Unlike 'replicated', does not use nccl + all-reduce for replicating within a server. + """ + + def each_tower_has_variables(self): + return True + + def create_outer_variable_scope(self, device_num): + return tf.variable_scope( + 'v%s' % device_num, reuse=self._reuse_vars, + custom_getter=variable_mgr_util.OverrideToLocalVariableIfNotPsVar(), + use_resource=self.use_resource_vars) + + def preprocess_device_grads(self, device_grads): + return ([self.benchmark_cnn.param_server_device], device_grads) + + def get_gradients_to_apply(self, device_num, gradient_state): + device_grads = gradient_state # From 2nd result of preprocess_device_grads. 
+ + avg_grads, self.grad_has_inf_nan = ( + variable_mgr_util.aggregate_gradients_using_copy_with_device_selection( + self.benchmark_cnn, + device_grads, + use_mean=True, + check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale)) + + # Make shadow variable on a parameter server for each original trainable + # variable. + for i, (g, v) in enumerate(avg_grads): + my_name = variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/' + v.name + if my_name.endswith(':0'): + my_name = my_name[:-2] + new_v = tf.get_variable( + my_name, + dtype=v.dtype.base_dtype, + initializer=v.initial_value, + trainable=True) + avg_grads[i] = (g, new_v) + return avg_grads + + def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops, + loss_scale_params): + device_grads = gradient_state # From 2nd result of preprocess_device_grads. + + def get_apply_gradients_ops_func(): + """Returns a list of ops for updating gradients.""" + apply_gradients_ops = [] + # For each variable, apply the combined gradients for this server on + # the parameter server, and then wait for all other servers to do this. + for i, (g, v) in enumerate(grads): + apply_gradient_op = opt.apply_gradients([(g, v)]) + barrier = self.benchmark_cnn.add_sync_queues_and_barrier( + 'replicate_variable_%s' % i, [apply_gradient_op]) + with tf.control_dependencies([barrier]): + with tf.device(self.benchmark_cnn.cpu_device): + updated_value = v.read_value() + for my_d in range(len(self.benchmark_cnn.devices)): + apply_gradients_ops.append( + device_grads[my_d][i][1].assign(updated_value)) + return apply_gradients_ops + + variable_mgr_util.append_gradients_with_loss_scale( + training_ops, get_apply_gradients_ops_func, loss_scale_params, + self.grad_has_inf_nan) + + def _strip_port(self, s): + if s.endswith(':0'): + return s[:-2] + return s + + def get_post_init_ops(self): + # Copy initialized variables for variables on the parameter server + # to the local copy of the variable. + + local_vars = tf.local_variables() + local_var_by_name = dict( + [(self._strip_port(v.name), v) for v in local_vars]) + post_init_ops = [] + for v in tf.global_variables(): + if v.name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0/'): + prefix = self._strip_port( + v.name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0'):]) + for i in range(self.benchmark_cnn.num_gpus): + name = 'v%s%s' % (i, prefix) + if name in local_var_by_name: + copy_to = local_var_by_name[name] + post_init_ops.append(copy_to.assign(v.read_value())) + return post_init_ops + + def _remove_shadow_var_prefix_if_present(self, var_name): + if var_name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'): + return var_name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):] + else: + return var_name + + def var_dict_name(self, v): + return self._strip_port(self._remove_shadow_var_prefix_if_present(v.name)) + + def savable_variables(self): + """Returns a list/dict of savable variables to pass to tf.train.Saver.""" + params = {} + for v in tf.global_variables(): + assert (v.name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0/') + or v.name in ('global_step:0', 'loss_scale:0', + 'loss_scale_normal_steps:0')), ( + 'Invalid global variable: %s' % v) + # We store variables in the checkpoint with the shadow variable prefix + # removed so we can evaluate checkpoints in non-distributed replicated + # mode. The checkpoints can also be loaded for training in + # distributed_replicated mode. 
+ name = self._strip_port(self._remove_shadow_var_prefix_if_present(v.name)) + params[name] = v + for v in tf.local_variables(): + # Non-trainable variables, such as batch norm moving averages, do not have + # corresponding global shadow variables, so we add them here. Trainable + # local variables have corresponding global shadow variables, which were + # added in the global variable loop above. + if v.name.startswith('v0/') and v not in tf.trainable_variables(): + params[self._strip_port(v.name)] = v + return params + + def get_devices(self): + return self.benchmark_cnn.raw_devices diff --git a/cv/classification/resnet50/tensorflow/variable_mgr_util.py b/cv/classification/resnet50/tensorflow/variable_mgr_util.py new file mode 100644 index 0000000000000000000000000000000000000000..94ce3e4b7c48d49797802f3dfadbaf0d4108d902 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/variable_mgr_util.py @@ -0,0 +1,676 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for VariableMgr.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as pycoll +import operator + +import numpy as np +import tensorflow.compat.v1 as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.ops import math_ops + + +PS_SHADOW_VAR_PREFIX = 'ps_var' + +AutoLossScaleParams = pycoll.namedtuple( + 'AutoLossScaleParams', + [ + # If true, enable automatic loss scaling. + 'enable_auto_loss_scale', + # The value to scale the loss before computing gradients. + 'loss_scale', + # Number of normal steps with the current `loss_scale`. + 'loss_scale_normal_steps', + # Increase loss scale every n steps. + 'inc_loss_scale_every_n', + # If true, the current worker is chief. The current implementation + # relies on the chief to update loss_scale value, but in future, we + # might change this to ask the parameter server to update loss_scales + # for better performance. + # TODO(tanmingxing): remove this if loss_scale is updated in ps. + 'is_chief', + ]) + + +def get_loss_scale_update_op(loss_scale, loss_scale_normal_steps, + inc_loss_scale_every_n): + """Returns the update op for loss scaling variables. + + We maintain the counter `loss_scale_normal_steps` to count the number of steps + we have been using the current `loss_scale`. In most cases, this function + increments `loss_scale_normal_steps`. However, if `loss_scale_normal_steps` is + greater than the threshold `inc_loss_scale_every_n`, we double `loss_scale` + and reset `loss_scale_normal_steps` to zero. + + This op is only called if the gradients don't have any infs or nans. 
Instead, + if infs or nans occur in the gradients, we immeditately halve `loss_scale` and + reset `loss_scale_normal_steps` to zero. + + Args: + loss_scale: a tf.Variable represneting the loss_scale value. + loss_scale_normal_steps: a tf.Variable representing the number of training + steps that have run since the loss_scale last changed. + inc_loss_scale_every_n: a Python integer threshold. `loss_scale` is + increased every `inc_loss_scale_every_n` steps, unless the gradients have + infs or nans. + + Returns: + An op for updating `loss_scale` and `loss_scale_normal_steps`. + """ + + def increment_loss_scale_normal_steps_func(): + return tf.group(loss_scale_normal_steps.assign_add(1)) + + def increase_loss_scale_func(): + return tf.group( + tf.assign(loss_scale_normal_steps, 0), + tf.assign(loss_scale, loss_scale * 2)) + + # true_fn and false_fn must have the same type. + return tf.cond(loss_scale_normal_steps < inc_loss_scale_every_n, + increment_loss_scale_normal_steps_func, + increase_loss_scale_func) + + +def append_gradients_with_loss_scale(training_ops, get_apply_gradients_ops_func, + loss_scale_params, grad_has_inf_nan): + """Selectively appends gradients update ops with loss scaling. + + Args: + training_ops: a list of training ops to be executed. + get_apply_gradients_ops_func: a function that returns a list of ops for + applying gradients. Here, we must pass a function instead of the actual + list of ops; otherwise, those ops would be executed unconditionally due to + the semantics of tf.cond. + loss_scale_params: An AutoLossScaleParams tuple. + grad_has_inf_nan: Boolean tensor indicating whether the gradients have infs + or nans. + """ + is_chief = loss_scale_params.is_chief + loss_scale = loss_scale_params.loss_scale + loss_scale_normal_steps = loss_scale_params.loss_scale_normal_steps + inc_loss_scale_every_n = loss_scale_params.inc_loss_scale_every_n + enable_auto_loss_scale = loss_scale_params.enable_auto_loss_scale + + if loss_scale is None or not enable_auto_loss_scale or not is_chief: + training_ops.extend(get_apply_gradients_ops_func()) + else: + # If nans/infs occurred, skip applying gradients and instead update + # loss_scale (halve loss_scale and reset loss_scale_normal_steps to zero). + def update_op_if_nan_or_inf(): + """Update loss_scale and discard gradients if nans/infs occurred.""" + return tf.group( + tf.assign(loss_scale, loss_scale / 2.), + tf.assign(loss_scale_normal_steps, 0)) + + # Otherwise, apply gradients, and update loss_scale and + # loss_scale_normal_steps. + def update_op_if_no_nan_or_inf(): + """Apply gradients, and update loss scaling.""" + return tf.group( + get_loss_scale_update_op(loss_scale, loss_scale_normal_steps, + inc_loss_scale_every_n), + *get_apply_gradients_ops_func()) + + # TODO(tanmingxing): Add support for independent and distributed all_reduce. + assert grad_has_inf_nan is not None + update_op = tf.cond( + grad_has_inf_nan, + update_op_if_nan_or_inf, + update_op_if_no_nan_or_inf, + name='cond_if_grad_has_inf_nan' + ) + training_ops.append(update_op) + + +# To be used with custom_getter on tf.get_variable. +class OverrideCachingDevice(object): + """Variable getter which caches variables on the least loaded device. + + Variables smaller than a certain threshold are cached on a single specific + device, as specified in the constructor. All other variables are load balanced + across a pool of devices, by caching each variable on the least loaded device. 
+ + Note that variable creation only happen when building the model graph on the + first device (see how it sets the 'reuse' parameter in + VariableMgr.*.create_outer_variable_scope()). That means, for all other + devices, the variable scope will reuse the variables created before, which + requires that we set the caching_device correctly as otherwise it may not be + able to find the previously created variable and will create a new one. This + requires when building the model graph on different devices, variables with + the same name should have same size. + + TODO(laigd): consider adding tests or verification logic to enforce this, or + refactor it. + """ + + def __init__(self, devices, device_for_small_variables, + small_variable_size_threshold): + self.devices = devices + self.sizes = [0] * len(self.devices) + self.device_for_small_variables = device_for_small_variables + self.small_variable_size_threshold = small_variable_size_threshold + + def __call__(self, getter, *args, **kwargs): + size = tf.TensorShape(kwargs['shape']).num_elements() + if size < self.small_variable_size_threshold: + device_name = self.device_for_small_variables + else: + device_index, _ = min(enumerate(self.sizes), key=operator.itemgetter(1)) + device_name = self.devices[device_index] + self.sizes[device_index] += size + + kwargs['caching_device'] = device_name + var = getter(*args, **kwargs) + return var + + +# To be used with custom_getter on tf.get_variable. Ensures the created variable +# is in LOCAL_VARIABLES and not GLOBAL_VARIBLES collection. +class OverrideToLocalVariableIfNotPsVar(object): + + # args and kwargs come from the custom_getter interface for Tensorflow + # variables, and matches tf.get_variable's signature, with the addition of + # 'getter' at the beginning. + def __call__(self, getter, name, *args, **kwargs): + if name.startswith(PS_SHADOW_VAR_PREFIX): + return getter(*args, **kwargs) + + if 'collections' in kwargs: + collections = kwargs['collections'] + if not collections: + collections = [tf.GraphKeys.GLOBAL_VARIABLES] + else: + collections = collections[:] + collections.remove(tf.GraphKeys.GLOBAL_VARIABLES) + collections.append(tf.GraphKeys.LOCAL_VARIABLES) + kwargs['collections'] = list(collections) + return getter(name, *args, **kwargs) + + +class ParamServerDeviceSetter(object): + """Helper class to assign variables on the least loaded ps-device.""" + + def __init__(self, worker_device, ps_devices): + """Initializer for ParamServerDevicSetter. + + Args: + worker_device: the device to use for computer ops. + ps_devices: a list of device to use for Variable ops. Each variable is + assigned to the least loaded device. + """ + self.ps_devices = ps_devices + self.worker_device = worker_device + self.ps_sizes = [0] * len(self.ps_devices) + + def __call__(self, op): + if op.device: + return op.device + if op.type not in ['Variable', 'VariableV2']: + return self.worker_device + + device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1)) + device_name = self.ps_devices[device_index] + var_size = op.outputs[0].get_shape().num_elements() + self.ps_sizes[device_index] += var_size + + return device_name + + +class StagedModelVariable(object): + """Staging variable wrapper that decouples reads and updates. + + This class represents a variable through a staging buffer. Reads from this + variable directly gets from the staging buffer. Updates are stacked into + another staging buffer, and will be processed later. 
+ """ + + def __init__(self, real_var, var_stage_get, variable_mgr): + """Initializer for the model variables through a staging buffer. + + Args: + real_var: the underlying real variable. + var_stage_get: the read op from the staging buffer. + variable_mgr: the parent variable-manager. + """ + self.real_var = real_var + self.var_stage_get = var_stage_get + self.variable_mgr = variable_mgr + + def _value(self): + """The read access of this variable. The content from the staging buffer.""" + return self.var_stage_get + + def _ref(self): + """Return the underlying variable ref, required by tf.colocate_with.""" + return self.real_var._ref() # pylint: disable=protected-access + + def read_value(self): + """Mimics tf.Variable.read_value().""" + return tf.identity(self.var_stage_get, name='read') + + @property + def dtype(self): + """Return the non-reference dtype.""" + return self.var_stage_get.dtype + + def assign_sub(self, delta, name=None, read_value=True): + """Mimic the updates to the variable. + + Args: + delta: is pushed into a staging buffer and will be pumped later. + name: currently ignored; names of ops and the StagingArea are + computed without using this pass name. + read_value: if True, will return something which evaluates to the new + value of the variable; if False will return the assign op. + Returns: + The actual updates. The colocation constraint will be reapplied. + """ + # This parameter is ignored: the StagingArea only supports setting + # the shared name, not the names of individual ops it uses. + del name + + # colocate_with(None, True) clears the colocation constraints. + # Push the delta into a staging buffer. + with ops.colocate_with(None, True), tf.device(self.var_stage_get.device): + delta_staging_area = data_flow_ops.StagingArea( + [self.var_stage_get.dtype], shapes=[self.var_stage_get.shape]) + delta_put_op = delta_staging_area.put([delta]) + self.variable_mgr.staging_delta_ops.append(delta_put_op) + delta_get_op = delta_staging_area.get()[0] + # Return the actual updates. The colocation constraint will be reapplied. + return self.real_var.assign_sub(delta_get_op, read_value=read_value) + + @staticmethod + # pylint: disable=bad-staticmethod-argument,invalid-name + def _TensorConversionFunction(self, dtype=None, name=None, as_ref=False): + """Utility function for converting a StagedModelVariable to a Tensor.""" + del dtype, name # unused: this function returns the cached ref or value. + if as_ref: + return self._ref() + else: + return self._value() + + +ops.register_tensor_conversion_function( + StagedModelVariable, StagedModelVariable._TensorConversionFunction) # pylint: disable=protected-access + + +class StagedVariableGetter(object): + """A variable getter through staging buffers on devices. + + Instead of a caching device, this getter tracks where the variable is used. + And on each device, it goes through a staging buffer. + """ + + def __init__(self, device_num, devices, cpu_device, variable_mgr): + """Initializer for StagedVariableGetter. + + Args: + device_num: the current device index. + devices: a list of all the devices to build towers. + cpu_device: a cpu_device for this replica. If None, no cpu-caching is + done. + variable_mgr: the parent variable manager. 
+ """ + self.device_num = device_num + self.devices = devices + self.cpu_device = cpu_device + self.variable_mgr = variable_mgr + + def __call__(self, getter, name, *args, **kwargs): + staging_ops = self.variable_mgr.staging_vars_on_devices[self.device_num] + if name in staging_ops: + put_op, get_op = staging_ops[name] + return get_op + real_var = getter(name, *args, **kwargs) + shape = kwargs['shape'] + dtype = kwargs['dtype'] + trainable = kwargs['trainable'] + if self.cpu_device: + with tf.device(self.cpu_device): + # This helps copying the weights from the parameter to this server only + # once. + if name in self.variable_mgr.staged_vars_on_cpu: + cpu_var = self.variable_mgr.staged_vars_on_cpu[name] + else: + cpu_var = tf.identity(real_var) + self.variable_mgr.staged_vars_on_cpu[name] = cpu_var + var_to_stage = cpu_var + else: + var_to_stage = tf.identity(real_var) # de-reference the variable. + + with tf.device(self.devices[self.device_num]): + staging_area = data_flow_ops.StagingArea([dtype], shapes=[shape]) + put_op = staging_area.put([var_to_stage]) + get_op = staging_area.get()[0] + staging_ops[name] = (put_op, get_op) + if trainable: + # For trainable variables, they are managed separatedly through + # apply_gradients. + return get_op + else: + # For other shadow variables, the access is decoupled through a wrapper + # class. + return StagedModelVariable(real_var, get_op, self.variable_mgr) + + def trainable_variables_on_device(self, rel_device_num, abs_device_num, + writable): + """Return the set of trainable variables on the specified device. + + Args: + rel_device_num: local worker device index. + abs_device_num: global graph device index. + writable: whether the returned variables is writable or read-only. + + Returns: + Return the set of trainable variables on the specified device. + """ + del abs_device_num + params_refs = tf.trainable_variables() + if writable: + return params_refs + params = [] + for param in params_refs: + var_name = param.name.split(':')[0] + _, var_get_op = self.variable_mgr.staging_vars_on_devices[rel_device_num][ + var_name] + params.append(var_get_op) + return params + + +def aggregate_gradients_using_copy_with_device_selection( + benchmark_cnn, tower_grads, use_mean, check_inf_nan): + """Aggregate gradients, controlling device for the aggregation. + + Args: + benchmark_cnn: benchmark_cnn class. + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over towers. The inner list is over individual gradients. + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: If true, check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. 
+ """ + if benchmark_cnn.local_parameter_device_flag == 'gpu': + avail_devices = benchmark_cnn.raw_devices + else: + avail_devices = [benchmark_cnn.param_server_device] + agg_grads = [] + has_nan_or_inf_list = [] + for i, single_grads in enumerate(zip(*tower_grads)): + with tf.device(avail_devices[i % len(avail_devices)]): + grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy( + single_grads, use_mean, check_inf_nan) + agg_grads.append(grad_and_var) + has_nan_or_inf_list.append(has_nan_or_inf) + if check_inf_nan: + return agg_grads, tf.reduce_any(has_nan_or_inf_list) + else: + return agg_grads, None + + +def aggregate_gradients_using_copy_with_variable_colocation( + tower_grads, use_mean, check_inf_nan): + """Aggregate gradients, colocating computation with the gradient's variable. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over towers. The inner list is over individual gradients. All variables + of the same gradient across towers must be the same (that is, + tower_grads[x][a][1] == tower_grads[y][a][1] for all indices x, y, and a) + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: If true, check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. + """ + agg_grads = [] + has_nan_or_inf_list = [] + for single_grads in zip(*tower_grads): + # Note that each single_grads looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + var = single_grads[0][1] + + for _, v in single_grads: + assert v == var + + with tf.device(var.device): + grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy( + single_grads, use_mean, check_inf_nan) + agg_grads.append(grad_and_var) + has_nan_or_inf_list.append(has_nan_or_inf) + + if check_inf_nan: + return agg_grads, tf.reduce_any(has_nan_or_inf_list) + else: + return agg_grads, None + + +def aggregate_gradients_using_copy(tower_grads, use_mean, check_inf_nan): + """Calculate the average gradient for each shared variable across all towers. + + Note that this function provides a synchronization point across all towers. + + Args: + tower_grads: List of lists of (gradient, variable) tuples. The outer list + is over towers. The inner list is over individual gradients. + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. + """ + agg_grads = [] + has_nan_or_inf_list = [] + + for single_grads in zip(*tower_grads): + grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy( + single_grads, use_mean, check_inf_nan) + agg_grads.append(grad_and_var) + has_nan_or_inf_list.append(has_nan_or_inf) + + if check_inf_nan: + return agg_grads, tf.reduce_any(has_nan_or_inf_list) + else: + return agg_grads, None + + +# The following two functions are copied from +# tensorflow/python/eager/backprop.py. We do not directly use them as they are +# not exported and subject to change at any time. 
+def flatten_nested_indexed_slices(grad): + assert isinstance(grad, ops.IndexedSlices) + if isinstance(grad.values, ops.Tensor): + return grad + else: + assert isinstance(grad.values, ops.IndexedSlices) + g = flatten_nested_indexed_slices(grad.values) + return ops.IndexedSlices(g.values, array_ops.gather(grad.indices, + g.indices), + g.dense_shape) + + +def aggregate_indexed_slices_gradients(grads): + """Aggregates gradients containing `IndexedSlices`s.""" + if len(grads) < 1: + return None + elif len(grads) == 1: + return grads[0] + else: + grads = [g for g in grads if g is not None] + # If any gradient is a `Tensor`, sum them up and return a dense tensor + # object. + if any(isinstance(g, ops.Tensor) for g in grads): + return math_ops.add_n(grads) + + # The following `_as_indexed_slices_list` casts ids of IndexedSlices into + # int64. It is to make sure the inputs of `concat` all have same the data + # type. + grads = math_ops._as_indexed_slices_list(grads) # pylint: disable=protected-access + + grads = [flatten_nested_indexed_slices(x) for x in grads] + # Form IndexedSlices out of the concatenated values and indices. + concat_grad = ops.IndexedSlices( + array_ops.concat([x.values for x in grads], axis=0), + array_ops.concat([x.indices for x in grads], axis=0), + grads[0].dense_shape) + + return concat_grad + + +def aggregate_single_gradient_using_copy(grad_and_vars, use_mean, + check_inf_nan): + """Calculate the average gradient for a shared variable across all towers. + + Note that this function provides a synchronization point across all towers. + + Args: + grad_and_vars: A list or tuple of (gradient, variable) tuples. Each + (gradient, variable) pair within the outer list represents the gradient + of the variable calculated for a single tower, and the number of pairs + equals the number of towers. + use_mean: if True, mean is taken, else sum of gradients is taken. + check_inf_nan: check grads for nans and infs. + + Returns: + The tuple ([(average_gradient, variable),], has_nan_or_inf) where the + gradient has been averaged across all towers. The variable is chosen from + the first tower. The has_nan_or_inf indicates the grads has nan or inf. + """ + grads = [g for g, _ in grad_and_vars] + if any(isinstance(g, tf.IndexedSlices) for g in grads): + # TODO(reedwm): All-reduce IndexedSlices more effectively. + grad = aggregate_indexed_slices_gradients(grads) + else: + grad = tf.add_n(grads) + + if use_mean and len(grads) > 1: + grad = tf.scalar_mul(1.0 / len(grads), grad) + + v = grad_and_vars[0][1] + if check_inf_nan: + with tf.name_scope('check_for_inf_and_nan'): + has_nan_or_inf = tf.logical_not(tf.reduce_all(tf.is_finite(grads))) + return (grad, v), has_nan_or_inf + else: + return (grad, v), None + + +# This class is copied from +# https://github.com/tensorflow/tensorflow/blob/590d6eef7e91a6a7392c8ffffb7b58f2e0c8bc6b/tensorflow/contrib/training/python/training/device_setter.py#L56. +# We copy it since contrib has been removed from TensorFlow. +class GreedyLoadBalancingStrategy(object): + """Returns the least-loaded ps task for op placement. + + The load is calculated by a user-specified load function passed in at + construction. There are no units for load, and the load function is + responsible for providing an internally consistent measure. + + Note that this strategy is very sensitive to the exact order in which + ps ops (typically variables) are created, as it greedily places ops + on the least-loaded ps at the point each op is processed. 
+ + One reasonable heuristic is the `byte_size_load_fn`, which + estimates load as the number of bytes that would be used to store and + transmit the entire variable. More advanced load functions + could consider the difference in access patterns across ops, or trade + off CPU-intensive ops with RAM-intensive ops with network bandwidth. + + This class is intended to be used as a `ps_strategy` in + `tf.compat.v1.train.replica_device_setter`. + """ + + def __init__(self, num_tasks, load_fn): + """Create a new `LoadBalancingStrategy`. + + Args: + num_tasks: Number of ps tasks to cycle among. + load_fn: A callable that takes an `Operation` and returns a + numeric load value for that op. + """ + self._num_tasks = num_tasks + self._load_fn = load_fn + self._ps_loads = np.zeros(num_tasks) + + def __call__(self, op): + """Choose a ps task index for the given `Operation`. + + Args: + op: A `Operation` to be placed on ps. + + Returns: + The next ps task index to use for the `Operation`. Greedily + places the op on the least-loaded ps task so far, as determined + by the load function. + """ + task = np.argmin(self._ps_loads) + self._ps_loads[task] += self._load_fn(op) + return task + + +# This function is copied from +# https://github.com/tensorflow/tensorflow/blob/590d6eef7e91a6a7392c8ffffb7b58f2e0c8bc6b/tensorflow/contrib/training/python/training/device_setter.py#L105. +# We copy it since contrib has been removed from TensorFlow. +def byte_size_load_fn(op): + """Load function that computes the byte size of a single-output `Operation`. + + This is intended to be used with `"Variable"` ops, which have a single + `Tensor` output with the contents of the variable. However, it can also be + used for calculating the size of any op that has a single output. + + Intended to be used with `GreedyLoadBalancingStrategy`. + + Args: + op: An `Operation` with a single output, typically a "Variable" op. + + Returns: + The number of bytes in the output `Tensor`. + + Raises: + ValueError: if `op` does not have a single output, or if the shape of the + single output is not fully-defined. + """ + if len(op.outputs) != 1: + raise ValueError('Op %s must have a single output' % op) + output = op.outputs[0] + elem_size = output.dtype.size + shape = output.get_shape() + if not shape.is_fully_defined(): + # Due to legacy behavior, scalar "Variable" ops have output Tensors that + # have unknown shape when the op is created (and hence passed to this + # load function for placement), even though the scalar shape is set + # explicitly immediately afterward. + shape = tensor_shape.TensorShape(op.get_attr('shape')) + shape.assert_is_fully_defined() + return shape.num_elements() * elem_size + diff --git a/cv/classification/resnet50/tensorflow/variable_mgr_util_test.py b/cv/classification/resnet50/tensorflow/variable_mgr_util_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0915224f9681ab34daee03e01d12852b15d95298 --- /dev/null +++ b/cv/classification/resnet50/tensorflow/variable_mgr_util_test.py @@ -0,0 +1,153 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for variable_mgr_util.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +import variable_mgr_util + + +class VariableMgrUtilTest(tf.test.TestCase): + + def testGetLossScaleUpdateOpTruePath(self): + loss_scale = tf.Variable(4) + # loss_scale_normal_steps >= inc_loss_scale_every_n + loss_scale_normal_steps = tf.Variable(10) + inc_loss_scale_every_n = 10 + update_op = variable_mgr_util.get_loss_scale_update_op( + loss_scale, loss_scale_normal_steps, inc_loss_scale_every_n) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(update_op) + + self.assertEqual(sess.run(loss_scale), 8) + self.assertEqual(sess.run(loss_scale_normal_steps), 0) + + def testGetLossScaleUpdateOpFalsePath(self): + loss_scale = tf.Variable(4) + # loss_scale_normal_steps < inc_loss_scale_every_n + loss_scale_normal_steps = tf.Variable(9) + inc_loss_scale_every_n = 10 + update_op = variable_mgr_util.get_loss_scale_update_op( + loss_scale, loss_scale_normal_steps, inc_loss_scale_every_n) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(update_op) + + self.assertEqual(sess.run(loss_scale), 4) + self.assertEqual(sess.run(loss_scale_normal_steps), 10) + + def testAppendGradientsWithLossScaleWithAutoScaleDisabled(self): + v = tf.Variable(0) + training_ops = [] + get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)] + loss_scale_params = variable_mgr_util.AutoLossScaleParams( + enable_auto_loss_scale=False, # no auto loss scale. 
+        loss_scale=tf.Variable(4),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=True)
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=True)
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 1)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 4)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 10)
+
+  def testAppendGradientsWithLossScaleForNonChiefWorker(self):
+    v = tf.Variable(0)
+    training_ops = []
+    get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
+    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
+        enable_auto_loss_scale=True,
+        loss_scale=tf.Variable(4),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=False)  # Non-chief
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=False)
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 1)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 4)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 10)
+
+  def testAppendGradientsWithLossScaleWithoutNan(self):
+    v = tf.Variable(0)
+    training_ops = []
+    get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
+    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
+        enable_auto_loss_scale=True,
+        loss_scale=tf.Variable(4, dtype=tf.float32),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=True)
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=tf.constant(False))
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 1)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 8)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 0)
+
+  def testAppendGradientsWithLossScaleWithNan(self):
+    v = tf.Variable(0)
+    training_ops = []
+    get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
+    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
+        enable_auto_loss_scale=True,
+        loss_scale=tf.Variable(4, dtype=tf.float32),
+        loss_scale_normal_steps=tf.Variable(10),
+        inc_loss_scale_every_n=10,
+        is_chief=True)
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops,
+        get_apply_gradients_ops_func,
+        loss_scale_params,
+        grad_has_inf_nan=tf.constant(True))
+
+    with self.test_session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(training_ops)
+      self.assertEqual(sess.run(v), 0)  # Skip updating for v.
+      # Halve loss_scale and reset loss_scale_normal_steps.
+      self.assertEqual(sess.run(loss_scale_params.loss_scale), 2)
+      self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 0)
+
+
+if __name__ == '__main__':
+  tf.disable_v2_behavior()
+  tf.test.main()
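
The tests above all revolve around the same loss-scaling policy implemented by `get_loss_scale_update_op` and `append_gradients_with_loss_scale`: after `inc_loss_scale_every_n` consecutive clean steps the scale is doubled, and any step whose gradients contain an inf or NaN skips the weight update, halves the scale, and resets the step counter. The sketch below is a minimal, framework-free illustration of that policy (the `update_loss_scale` helper is ours, not part of the patch) and can serve as a mental model when reading the graph-mode implementation.

```python
# Minimal plain-Python sketch of the loss-scaling policy exercised above.
# `update_loss_scale` is an illustrative helper, not part of this patch.

def update_loss_scale(loss_scale, normal_steps, inc_every_n, grad_has_inf_nan):
    """Return the next (loss_scale, normal_steps) pair for one training step."""
    if grad_has_inf_nan:
        # Bad step: the weight update is skipped, the scale is halved, and the
        # counter of consecutive clean steps restarts.
        return loss_scale / 2.0, 0
    if normal_steps < inc_every_n:
        # Clean step, but the threshold has not been reached yet: just count it.
        return loss_scale, normal_steps + 1
    # inc_every_n clean steps reached: double the scale and restart the counter.
    return loss_scale * 2.0, 0


if __name__ == "__main__":
    # Mirrors testGetLossScaleUpdateOpTruePath: 10 clean steps at threshold 10.
    assert update_loss_scale(4.0, 10, 10, grad_has_inf_nan=False) == (8.0, 0)
    # Mirrors testGetLossScaleUpdateOpFalsePath: only 9 clean steps so far.
    assert update_loss_scale(4.0, 9, 10, grad_has_inf_nan=False) == (4.0, 10)
    # Mirrors the inf/NaN test: the scale is halved and the counter reset.
    assert update_loss_scale(4.0, 10, 10, grad_has_inf_nan=True) == (2.0, 0)
    print("loss-scale policy sketch OK")
```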