diff --git a/cv/detection/ssd/tensorflow/LICENSE b/cv/detection/ssd/tensorflow/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/cv/detection/ssd/tensorflow/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+      You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
diff --git a/cv/detection/ssd/tensorflow/README.md b/cv/detection/ssd/tensorflow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a130af1089b2aad31a20bdcb1143fa22136f65f
--- /dev/null
+++ b/cv/detection/ssd/tensorflow/README.md
@@ -0,0 +1,42 @@
+### Download the VOC dataset
+```
+cd dataset
+```
+Download the [Pascal VOC Dataset](https://pjreddie.com/projects/pascal-voc-dataset-mirror/) and reorganize the directory as follows:
+```
+VOCROOT/
+   |->VOC2007/
+   |    |->Annotations/
+   |    |->ImageSets/
+   |    |->...
+   |->VOC2012/   # used in this example
+   |    |->Annotations/
+   |    |->ImageSets/
+   |    |->...
+   |->VOC2007TEST/
+   |    |->Annotations/
+   |    |->...
+```
+VOCROOT is the root path of your Pascal VOC dataset.
+```
+mkdir tfrecords
+pip3 install tf_slim
+python3 convert_voc_sample_tfrecords.py --dataset_directory=./ --output_directory=tfrecords --train_splits VOC2012_sample --validation_splits VOC2012_sample
+
+cd ..
+```
+### Download the checkpoint
+Download the pre-trained VGG-16 model (reduced fc) from [here](https://drive.google.com/drive/folders/184srhbt8_uvLKeWW_Yo8Mc5wTyc0lJT7) and put it into a sub-directory named 'model' (we support SaverDef.V2 by default; the V1 version is also available for compatibility).
+
+### Train
+#### Multiple GPUs
+```
+python3 train_ssd.py --batch_size 16
+```
+
+
+### Result
+
+| Configuration | Acc | FPS |
+| --- | --- | --- |
+| multi_card | 0.783513 | 3.177 |
\ No newline at end of file
diff --git a/cv/detection/ssd/tensorflow/dataset/convert_tfrecords.py b/cv/detection/ssd/tensorflow/dataset/convert_tfrecords.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e7b38fe151070b067c194b59a51b085bf85a23d
--- /dev/null
+++ b/cv/detection/ssd/tensorflow/dataset/convert_tfrecords.py
@@ -0,0 +1,395 @@
+# Copyright 2018 Changan Wang
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from datetime import datetime
+import os
+import random
+import sys
+import threading
+import xml.etree.ElementTree as xml_tree
+
+import numpy as np
+import six
+
+import tensorflow.compat.v1 as tf
+
+import dataset_common
+
+'''How to organize your dataset folder:
+  VOCROOT/
+   |->VOC2007/
+   |    |->Annotations/
+   |    |->ImageSets/
+   |    |->...
+   |->VOC2012/
+   |    |->Annotations/
+   |    |->ImageSets/
+   |    |->...
+   |->VOC2007TEST/
+   |    |->Annotations/
+   |    |->...
+''' +tf.app.flags.DEFINE_string('dataset_directory', '/media/rs/7A0EE8880EE83EAF/Detections/PASCAL/VOC', + 'All datas directory') +tf.app.flags.DEFINE_string('train_splits', 'VOC2007, VOC2012', + 'Comma-separated list of the training data sub-directory') +tf.app.flags.DEFINE_string('validation_splits', 'VOC2007TEST', + 'Comma-separated list of the validation data sub-directory') +tf.app.flags.DEFINE_string('output_directory', '/media/rs/7A0EE8880EE83EAF/Detections/SSD/dataset/tfrecords', + 'Output data directory') +tf.app.flags.DEFINE_integer('train_shards', 16, + 'Number of shards in training TFRecord files.') +tf.app.flags.DEFINE_integer('validation_shards', 16, + 'Number of shards in validation TFRecord files.') +tf.app.flags.DEFINE_integer('num_threads', 8, + 'Number of threads to preprocess the images.') +RANDOM_SEED = 180428 + +FLAGS = tf.app.flags.FLAGS + +def _int64_feature(value): + """Wrapper for inserting int64 features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + +def _float_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + +def _bytes_list_feature(value): + """Wrapper for inserting a list of bytes features into Example proto. + """ + if not isinstance(value, list): + value = [value] + return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + +def _bytes_feature(value): + """Wrapper for inserting bytes features into Example proto.""" + if isinstance(value, six.string_types): + value = six.binary_type(value, encoding='utf-8') + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + +def _convert_to_example(filename, image_name, image_buffer, bboxes, labels, labels_text, + difficult, truncated, height, width): + """Build an Example proto for an example. 
+ + Args: + filename: string, path to an image file, e.g., '/path/to/example.JPG' + image_buffer: string, JPEG encoding of RGB image + bboxes: List of bounding boxes for each image + labels: List of labels for bounding box + labels_text: List of labels' name for bounding box + difficult: List of ints indicate the difficulty of that bounding box + truncated: List of ints indicate the truncation of that bounding box + height: integer, image height in pixels + width: integer, image width in pixels + Returns: + Example proto + """ + ymin = [] + xmin = [] + ymax = [] + xmax = [] + for b in bboxes: + assert len(b) == 4 + # pylint: disable=expression-not-assigned + [l.append(point) for l, point in zip([ymin, xmin, ymax, xmax], b)] + # pylint: enable=expression-not-assigned + channels = 3 + image_format = 'JPEG' + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': _int64_feature(height), + 'image/width': _int64_feature(width), + 'image/channels': _int64_feature(channels), + 'image/shape': _int64_feature([height, width, channels]), + 'image/object/bbox/xmin': _float_feature(xmin), + 'image/object/bbox/xmax': _float_feature(xmax), + 'image/object/bbox/ymin': _float_feature(ymin), + 'image/object/bbox/ymax': _float_feature(ymax), + 'image/object/bbox/label': _int64_feature(labels), + 'image/object/bbox/label_text': _bytes_list_feature(labels_text), + 'image/object/bbox/difficult': _int64_feature(difficult), + 'image/object/bbox/truncated': _int64_feature(truncated), + 'image/format': _bytes_feature(image_format), + 'image/filename': _bytes_feature(image_name.encode('utf8')), + 'image/encoded': _bytes_feature(image_buffer)})) + return example + + +class ImageCoder(object): + """Helper class that provides TensorFlow image coding utilities.""" + + def __init__(self): + # Create a single Session to run all image coding calls. + self._sess = tf.Session() + + # Initializes function that converts PNG to JPEG data. + self._png_data = tf.placeholder(dtype=tf.string) + image = tf.image.decode_png(self._png_data, channels=3) + self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100) + + # Initializes function that converts CMYK JPEG data to RGB JPEG data. + self._cmyk_data = tf.placeholder(dtype=tf.string) + image = tf.image.decode_jpeg(self._cmyk_data, channels=0) + self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100) + + # Initializes function that decodes RGB JPEG data. + self._decode_jpeg_data = tf.placeholder(dtype=tf.string) + self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3) + + def png_to_jpeg(self, image_data): + return self._sess.run(self._png_to_jpeg, + feed_dict={self._png_data: image_data}) + + def cmyk_to_rgb(self, image_data): + return self._sess.run(self._cmyk_to_rgb, + feed_dict={self._cmyk_data: image_data}) + + def decode_jpeg(self, image_data): + image = self._sess.run(self._decode_jpeg, + feed_dict={self._decode_jpeg_data: image_data}) + assert len(image.shape) == 3 + assert image.shape[2] == 3 + return image + + +def _process_image(filename, coder): + """Process a single image file. + + Args: + filename: string, path to an image file e.g., '/path/to/example.JPG'. + coder: instance of ImageCoder to provide TensorFlow image coding utils. + Returns: + image_buffer: string, JPEG encoding of RGB image. + height: integer, image height in pixels. + width: integer, image width in pixels. + """ + # Read the image file. 
+ with tf.gfile.FastGFile(filename, 'rb') as f: + image_data = f.read() + + # Decode the RGB JPEG. + image = coder.decode_jpeg(image_data) + + # Check that image converted to RGB + assert len(image.shape) == 3 + height = image.shape[0] + width = image.shape[1] + assert image.shape[2] == 3 + + return image_data, height, width + +def _find_image_bounding_boxes(directory, cur_record): + """Find the bounding boxes for a given image file. + + Args: + directory: string; the path of all datas. + cur_record: list of strings; the first of which is the sub-directory of cur_record, the second is the image filename. + Returns: + bboxes: List of bounding boxes for each image. + labels: List of labels for bounding box. + labels_text: List of labels' name for bounding box. + difficult: List of ints indicate the difficulty of that bounding box. + truncated: List of ints indicate the truncation of that bounding box. + """ + anna_file = os.path.join(directory, cur_record[0], 'Annotations', cur_record[1].replace('jpg', 'xml')) + + tree = xml_tree.parse(anna_file) + root = tree.getroot() + + # Image shape. + size = root.find('size') + shape = [int(size.find('height').text), + int(size.find('width').text), + int(size.find('depth').text)] + # Find annotations. + bboxes = [] + labels = [] + labels_text = [] + difficult = [] + truncated = [] + for obj in root.findall('object'): + label = obj.find('name').text + labels.append(int(dataset_common.VOC_LABELS[label][0])) + labels_text.append(label.encode('ascii')) + + isdifficult = obj.find('difficult') + if isdifficult is not None: + difficult.append(int(isdifficult.text)) + else: + difficult.append(0) + + istruncated = obj.find('truncated') + if istruncated is not None: + truncated.append(int(istruncated.text)) + else: + truncated.append(0) + + bbox = obj.find('bndbox') + bboxes.append((float(bbox.find('ymin').text) / shape[0], + float(bbox.find('xmin').text) / shape[1], + float(bbox.find('ymax').text) / shape[0], + float(bbox.find('xmax').text) / shape[1] + )) + return bboxes, labels, labels_text, difficult, truncated + +def _process_image_files_batch(coder, thread_index, ranges, name, directory, all_records, num_shards): + """Processes and saves list of images as TFRecord in 1 thread. + + Args: + coder: instance of ImageCoder to provide TensorFlow image coding utils. + thread_index: integer, unique batch to run index is within [0, len(ranges)). + ranges: list of pairs of integers specifying ranges of each batches to + analyze in parallel. + name: string, unique identifier specifying the data set + directory: string; the path of all datas + all_records: list of string tuples; the first of each tuple is the sub-directory of the record, the second is the image filename. + num_shards: integer number of shards for this data set. + """ + # Each thread produces N shards where N = int(num_shards / num_threads). + # For instance, if num_shards = 128, and the num_threads = 2, then the first + # thread would produce shards [0, 64). + num_threads = len(ranges) + assert not num_shards % num_threads + num_shards_per_batch = int(num_shards / num_threads) + + shard_ranges = np.linspace(ranges[thread_index][0], + ranges[thread_index][1], + num_shards_per_batch + 1).astype(int) + num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] + + counter = 0 + for s in range(num_shards_per_batch): + # Generate a sharded version of the file name, e.g. 
'train-00002-of-00010' + shard = thread_index * num_shards_per_batch + s + output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) + output_file = os.path.join(FLAGS.output_directory, output_filename) + writer = tf.python_io.TFRecordWriter(output_file) + + shard_counter = 0 + files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) + for i in files_in_shard: + cur_record = all_records[i] + filename = os.path.join(directory, cur_record[0], 'JPEGImages', cur_record[1]) + + bboxes, labels, labels_text, difficult, truncated = _find_image_bounding_boxes(directory, cur_record) + image_buffer, height, width = _process_image(filename, coder) + + example = _convert_to_example(filename, cur_record[1], image_buffer, bboxes, labels, labels_text, + difficult, truncated, height, width) + writer.write(example.SerializeToString()) + shard_counter += 1 + counter += 1 + + if not counter % 1000: + print('%s [thread %d]: Processed %d of %d images in thread batch.' % + (datetime.now(), thread_index, counter, num_files_in_thread)) + sys.stdout.flush() + + writer.close() + print('%s [thread %d]: Wrote %d images to %s' % + (datetime.now(), thread_index, shard_counter, output_file)) + sys.stdout.flush() + shard_counter = 0 + print('%s [thread %d]: Wrote %d images to %d shards.' % + (datetime.now(), thread_index, counter, num_files_in_thread)) + sys.stdout.flush() + +def _process_image_files(name, directory, all_records, num_shards): + """Process and save list of images as TFRecord of Example protos. + + Args: + name: string, unique identifier specifying the data set + directory: string; the path of all datas + all_records: list of string tuples; the first of each tuple is the sub-directory of the record, the second is the image filename. + num_shards: integer number of shards for this data set. + """ + # Break all images into batches with a [ranges[i][0], ranges[i][1]]. + spacing = np.linspace(0, len(all_records), FLAGS.num_threads + 1).astype(np.int) + ranges = [] + threads = [] + for i in range(len(spacing) - 1): + ranges.append([spacing[i], spacing[i + 1]]) + + # Launch a thread for each batch. + print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges)) + sys.stdout.flush() + + # Create a mechanism for monitoring when all threads are finished. + coord = tf.train.Coordinator() + + # Create a generic TensorFlow-based utility for converting all image codings. + coder = ImageCoder() + + threads = [] + for thread_index in range(len(ranges)): + args = (coder, thread_index, ranges, name, directory, all_records, num_shards) + t = threading.Thread(target=_process_image_files_batch, args=args) + t.start() + threads.append(t) + + # Wait for all the threads to terminate. + coord.join(threads) + print('%s: Finished writing all %d images in data set.' % + (datetime.now(), len(all_records))) + sys.stdout.flush() + +def _process_dataset(name, directory, all_splits, num_shards): + """Process a complete data set and save it as a TFRecord. + + Args: + name: string, unique identifier specifying the data set. + directory: string, root path to the data set. + all_splits: list of strings, sub-path to the data set. + num_shards: integer number of shards for this data set. 
+ """ + all_records = [] + for split in all_splits: + jpeg_file_path = os.path.join(directory, split, 'JPEGImages') + images = tf.gfile.ListDirectory(jpeg_file_path) + jpegs = [im_name for im_name in images if im_name.strip()[-3:]=='jpg'] + all_records.extend(list(zip([split] * len(jpegs), jpegs))) + + shuffled_index = list(range(len(all_records))) + random.seed(RANDOM_SEED) + random.shuffle(shuffled_index) + all_records = [all_records[i] for i in shuffled_index] + _process_image_files(name, directory, all_records, num_shards) + +def parse_comma_list(args): + return [s.strip() for s in args.split(',')] + +def main(unused_argv): + assert not FLAGS.train_shards % FLAGS.num_threads, ( + 'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards') + assert not FLAGS.validation_shards % FLAGS.num_threads, ( + 'Please make the FLAGS.num_threads commensurate with ' + 'FLAGS.validation_shards') + print('Saving results to %s' % FLAGS.output_directory) + + # Run it! + _process_dataset('val', FLAGS.dataset_directory, parse_comma_list(FLAGS.validation_splits), FLAGS.validation_shards) + _process_dataset('train', FLAGS.dataset_directory, parse_comma_list(FLAGS.train_splits), FLAGS.train_shards) + +if __name__ == '__main__': + tf.app.run() diff --git a/cv/detection/ssd/tensorflow/dataset/convert_voc_sample_tfrecords.py b/cv/detection/ssd/tensorflow/dataset/convert_voc_sample_tfrecords.py new file mode 100644 index 0000000000000000000000000000000000000000..6fe35a4a67467fc5b0a8aea2f237548915722dee --- /dev/null +++ b/cv/detection/ssd/tensorflow/dataset/convert_voc_sample_tfrecords.py @@ -0,0 +1,401 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from datetime import datetime +import os +import random +import sys +import threading +import xml.etree.ElementTree as xml_tree + +import numpy as np +import six + +import tensorflow.compat.v1 as tf + +import dataset_common +tf.disable_eager_execution() + +'''How to organize your dataset folder: + VOCROOT/ + |->VOC2007/ + | |->Annotations/ + | |->ImageSets/ + | |->... + |->VOC2012/ + | |->Annotations/ + | |->ImageSets/ + | |->... + |->VOC2007TEST/ + | |->Annotations/ + | |->... 
+''' +tf.app.flags.DEFINE_string('dataset_directory', '/media/rs/7A0EE8880EE83EAF/Detections/PASCAL/VOC', + 'All datas directory') +tf.app.flags.DEFINE_string('train_splits', 'VOC2012', + 'Comma-separated list of the training data sub-directory') +tf.app.flags.DEFINE_string('validation_splits', 'VOC2012', + 'Comma-separated list of the validation data sub-directory') +tf.app.flags.DEFINE_string('output_directory', '/media/rs/7A0EE8880EE83EAF/Detections/SSD/dataset/tfrecords', + 'Output data directory') +tf.app.flags.DEFINE_integer('train_shards', 16, + 'Number of shards in training TFRecord files.') +tf.app.flags.DEFINE_integer('validation_shards', 16, + 'Number of shards in validation TFRecord files.') +tf.app.flags.DEFINE_integer('num_threads', 8, + 'Number of threads to preprocess the images.') +RANDOM_SEED = 180428 + +FLAGS = tf.app.flags.FLAGS + +def _int64_feature(value): + """Wrapper for inserting int64 features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + +def _float_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + +def _bytes_list_feature(value): + """Wrapper for inserting a list of bytes features into Example proto. + """ + if not isinstance(value, list): + value = [value] + return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + +def _bytes_feature(value): + """Wrapper for inserting bytes features into Example proto.""" + if isinstance(value, six.string_types): + value = six.binary_type(value, encoding='utf-8') + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + +def _convert_to_example(filename, image_name, image_buffer, bboxes, labels, labels_text, + difficult, truncated, height, width): + """Build an Example proto for an example. 
+ + Args: + filename: string, path to an image file, e.g., '/path/to/example.JPG' + image_buffer: string, JPEG encoding of RGB image + bboxes: List of bounding boxes for each image + labels: List of labels for bounding box + labels_text: List of labels' name for bounding box + difficult: List of ints indicate the difficulty of that bounding box + truncated: List of ints indicate the truncation of that bounding box + height: integer, image height in pixels + width: integer, image width in pixels + Returns: + Example proto + """ + ymin = [] + xmin = [] + ymax = [] + xmax = [] + for b in bboxes: + assert len(b) == 4 + # pylint: disable=expression-not-assigned + [l.append(point) for l, point in zip([ymin, xmin, ymax, xmax], b)] + # pylint: enable=expression-not-assigned + channels = 3 + image_format = 'JPEG' + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': _int64_feature(height), + 'image/width': _int64_feature(width), + 'image/channels': _int64_feature(channels), + 'image/shape': _int64_feature([height, width, channels]), + 'image/object/bbox/xmin': _float_feature(xmin), + 'image/object/bbox/xmax': _float_feature(xmax), + 'image/object/bbox/ymin': _float_feature(ymin), + 'image/object/bbox/ymax': _float_feature(ymax), + 'image/object/bbox/label': _int64_feature(labels), + 'image/object/bbox/label_text': _bytes_list_feature(labels_text), + 'image/object/bbox/difficult': _int64_feature(difficult), + 'image/object/bbox/truncated': _int64_feature(truncated), + 'image/format': _bytes_feature(image_format), + 'image/filename': _bytes_feature(image_name.encode('utf8')), + 'image/encoded': _bytes_feature(image_buffer)})) + return example + + +class ImageCoder(object): + """Helper class that provides TensorFlow image coding utilities.""" + + def __init__(self): + # Create a single Session to run all image coding calls. + self._sess = tf.Session() + + # Initializes function that converts PNG to JPEG data. + self._png_data = tf.placeholder(dtype=tf.string) + image = tf.image.decode_png(self._png_data, channels=3) + self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100) + + # Initializes function that converts CMYK JPEG data to RGB JPEG data. + self._cmyk_data = tf.placeholder(dtype=tf.string) + image = tf.image.decode_jpeg(self._cmyk_data, channels=0) + self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100) + + # Initializes function that decodes RGB JPEG data. + self._decode_jpeg_data = tf.placeholder(dtype=tf.string) + self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3) + + def png_to_jpeg(self, image_data): + return self._sess.run(self._png_to_jpeg, + feed_dict={self._png_data: image_data}) + + def cmyk_to_rgb(self, image_data): + return self._sess.run(self._cmyk_to_rgb, + feed_dict={self._cmyk_data: image_data}) + + def decode_jpeg(self, image_data): + image = self._sess.run(self._decode_jpeg, + feed_dict={self._decode_jpeg_data: image_data}) + assert len(image.shape) == 3 + assert image.shape[2] == 3 + return image + + +def _process_image(filename, coder): + """Process a single image file. + + Args: + filename: string, path to an image file e.g., '/path/to/example.JPG'. + coder: instance of ImageCoder to provide TensorFlow image coding utils. + Returns: + image_buffer: string, JPEG encoding of RGB image. + height: integer, image height in pixels. + width: integer, image width in pixels. + """ + # Read the image file. 
+ with tf.gfile.FastGFile(filename, 'rb') as f: + image_data = f.read() + + # Decode the RGB JPEG. + image = coder.decode_jpeg(image_data) + + # Check that image converted to RGB + assert len(image.shape) == 3 + height = image.shape[0] + width = image.shape[1] + assert image.shape[2] == 3 + + return image_data, height, width + +def _find_image_bounding_boxes(directory, cur_record): + """Find the bounding boxes for a given image file. + + Args: + directory: string; the path of all datas. + cur_record: list of strings; the first of which is the sub-directory of cur_record, the second is the image filename. + Returns: + bboxes: List of bounding boxes for each image. + labels: List of labels for bounding box. + labels_text: List of labels' name for bounding box. + difficult: List of ints indicate the difficulty of that bounding box. + truncated: List of ints indicate the truncation of that bounding box. + """ + anna_file = os.path.join(directory, cur_record[0], 'Annotations', cur_record[1].replace('jpg', 'xml')) + + tree = xml_tree.parse(anna_file) + root = tree.getroot() + + # Image shape. + size = root.find('size') + shape = [int(size.find('height').text), + int(size.find('width').text), + int(size.find('depth').text)] + # Find annotations. + bboxes = [] + labels = [] + labels_text = [] + difficult = [] + truncated = [] + for obj in root.findall('object'): + label = obj.find('name').text + labels.append(int(dataset_common.VOC_LABELS[label][0])) + labels_text.append(label.encode('ascii')) + + isdifficult = obj.find('difficult') + if isdifficult is not None: + difficult.append(int(isdifficult.text)) + else: + difficult.append(0) + + istruncated = obj.find('truncated') + if istruncated is not None: + truncated.append(int(istruncated.text)) + else: + truncated.append(0) + + bbox = obj.find('bndbox') + bboxes.append((float(bbox.find('ymin').text) / shape[0], + float(bbox.find('xmin').text) / shape[1], + float(bbox.find('ymax').text) / shape[0], + float(bbox.find('xmax').text) / shape[1] + )) + return bboxes, labels, labels_text, difficult, truncated + +def _process_image_files_batch(coder, thread_index, ranges, name, directory, all_records, num_shards): + """Processes and saves list of images as TFRecord in 1 thread. + + Args: + coder: instance of ImageCoder to provide TensorFlow image coding utils. + thread_index: integer, unique batch to run index is within [0, len(ranges)). + ranges: list of pairs of integers specifying ranges of each batches to + analyze in parallel. + name: string, unique identifier specifying the data set + directory: string; the path of all datas + all_records: list of string tuples; the first of each tuple is the sub-directory of the record, the second is the image filename. + num_shards: integer number of shards for this data set. + """ + # Each thread produces N shards where N = int(num_shards / num_threads). + # For instance, if num_shards = 128, and the num_threads = 2, then the first + # thread would produce shards [0, 64). + num_threads = len(ranges) + assert not num_shards % num_threads + num_shards_per_batch = int(num_shards / num_threads) + + shard_ranges = np.linspace(ranges[thread_index][0], + ranges[thread_index][1], + num_shards_per_batch + 1).astype(int) + num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] + + counter = 0 + for s in range(num_shards_per_batch): + # Generate a sharded version of the file name, e.g. 
'train-00002-of-00010' + shard = thread_index * num_shards_per_batch + s + output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) + output_file = os.path.join(FLAGS.output_directory, output_filename) + writer = tf.python_io.TFRecordWriter(output_file) + + shard_counter = 0 + files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) + for i in files_in_shard: + cur_record = all_records[i] + filename = os.path.join(directory, cur_record[0], 'JPEGImages', cur_record[1]) + + bboxes, labels, labels_text, difficult, truncated = _find_image_bounding_boxes(directory, cur_record) + image_buffer, height, width = _process_image(filename, coder) + + example = _convert_to_example(filename, cur_record[1], image_buffer, bboxes, labels, labels_text, + difficult, truncated, height, width) + writer.write(example.SerializeToString()) + shard_counter += 1 + counter += 1 + + if not counter % 1000: + print('%s [thread %d]: Processed %d of %d images in thread batch.' % + (datetime.now(), thread_index, counter, num_files_in_thread)) + sys.stdout.flush() + + writer.close() + print('%s [thread %d]: Wrote %d images to %s' % + (datetime.now(), thread_index, shard_counter, output_file)) + sys.stdout.flush() + shard_counter = 0 + print('%s [thread %d]: Wrote %d images to %d shards.' % + (datetime.now(), thread_index, counter, num_files_in_thread)) + sys.stdout.flush() + +def _process_image_files(name, directory, all_records, num_shards): + """Process and save list of images as TFRecord of Example protos. + + Args: + name: string, unique identifier specifying the data set + directory: string; the path of all datas + all_records: list of string tuples; the first of each tuple is the sub-directory of the record, the second is the image filename. + num_shards: integer number of shards for this data set. + """ + # Break all images into batches with a [ranges[i][0], ranges[i][1]]. + spacing = np.linspace(0, len(all_records), FLAGS.num_threads + 1).astype(np.int) + ranges = [] + threads = [] + for i in range(len(spacing) - 1): + ranges.append([spacing[i], spacing[i + 1]]) + + # Launch a thread for each batch. + print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges)) + sys.stdout.flush() + + # Create a mechanism for monitoring when all threads are finished. + coord = tf.train.Coordinator() + + # Create a generic TensorFlow-based utility for converting all image codings. + coder = ImageCoder() + + threads = [] + for thread_index in range(len(ranges)): + args = (coder, thread_index, ranges, name, directory, all_records, num_shards) + t = threading.Thread(target=_process_image_files_batch, args=args) + t.start() + threads.append(t) + + # Wait for all the threads to terminate. + coord.join(threads) + print('%s: Finished writing all %d images in data set.' % + (datetime.now(), len(all_records))) + sys.stdout.flush() + +def _process_dataset(name, directory, all_splits, num_shards): + """Process a complete data set and save it as a TFRecord. + + Args: + name: string, unique identifier specifying the data set. + directory: string, root path to the data set. + all_splits: list of strings, sub-path to the data set. + num_shards: integer number of shards for this data set. 
+ """ + all_records = [] + for split in all_splits: + image_names_file = os.path.join(directory, split, 'ImageSets/Main', name + '.txt') + with open(image_names_file) as f: + image_names = f.readlines() + images = [_in.strip() + '.jpg' for _in in image_names] + print(f"split {split} | name {name} | num images {len(images)}") + # jpeg_file_path = os.path.join(directory, split, 'JPEGImages') + # images = tf.gfile.ListDirectory(jpeg_file_path) + jpegs = [im_name for im_name in images if im_name.strip()[-3:]=='jpg'] + all_records.extend(list(zip([split] * len(jpegs), jpegs))) + + shuffled_index = list(range(len(all_records))) + random.seed(RANDOM_SEED) + random.shuffle(shuffled_index) + all_records = [all_records[i] for i in shuffled_index] + _process_image_files(name, directory, all_records, num_shards) + +def parse_comma_list(args): + return [s.strip() for s in args.split(',')] + +def main(unused_argv): + assert not FLAGS.train_shards % FLAGS.num_threads, ( + 'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards') + assert not FLAGS.validation_shards % FLAGS.num_threads, ( + 'Please make the FLAGS.num_threads commensurate with ' + 'FLAGS.validation_shards') + print('Saving results to %s' % FLAGS.output_directory) + + # Run it! + _process_dataset('val', FLAGS.dataset_directory, parse_comma_list(FLAGS.validation_splits), FLAGS.validation_shards) + _process_dataset('train', FLAGS.dataset_directory, parse_comma_list(FLAGS.train_splits), FLAGS.train_shards) + +if __name__ == '__main__': + tf.app.run() diff --git a/cv/detection/ssd/tensorflow/dataset/dataset_common.py b/cv/detection/ssd/tensorflow/dataset/dataset_common.py new file mode 100644 index 0000000000000000000000000000000000000000..9c17c0eea470df2d18c119b195e5313782f78aed --- /dev/null +++ b/cv/detection/ssd/tensorflow/dataset/dataset_common.py @@ -0,0 +1,238 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +import tf_slim as slim + + +VOC_LABELS = { + 'none': (0, 'Background'), + 'aeroplane': (1, 'Vehicle'), + 'bicycle': (2, 'Vehicle'), + 'bird': (3, 'Animal'), + 'boat': (4, 'Vehicle'), + 'bottle': (5, 'Indoor'), + 'bus': (6, 'Vehicle'), + 'car': (7, 'Vehicle'), + 'cat': (8, 'Animal'), + 'chair': (9, 'Indoor'), + 'cow': (10, 'Animal'), + 'diningtable': (11, 'Indoor'), + 'dog': (12, 'Animal'), + 'horse': (13, 'Animal'), + 'motorbike': (14, 'Vehicle'), + 'person': (15, 'Person'), + 'pottedplant': (16, 'Indoor'), + 'sheep': (17, 'Animal'), + 'sofa': (18, 'Indoor'), + 'train': (19, 'Vehicle'), + 'tvmonitor': (20, 'Indoor'), +} + +COCO_LABELS = { + "bench": (14, 'outdoor') , + "skateboard": (37, 'sports') , + "toothbrush": (80, 'indoor') , + "person": (1, 'person') , + "donut": (55, 'food') , + "none": (0, 'background') , + "refrigerator": (73, 'appliance') , + "horse": (18, 'animal') , + "elephant": (21, 'animal') , + "book": (74, 'indoor') , + "car": (3, 'vehicle') , + "keyboard": (67, 'electronic') , + "cow": (20, 'animal') , + "microwave": (69, 'appliance') , + "traffic light": (10, 'outdoor') , + "tie": (28, 'accessory') , + "dining table": (61, 'furniture') , + "toaster": (71, 'appliance') , + "baseball glove": (36, 'sports') , + "giraffe": (24, 'animal') , + "cake": (56, 'food') , + "handbag": (27, 'accessory') , + "scissors": (77, 'indoor') , + "bowl": (46, 'kitchen') , + "couch": (58, 'furniture') , + "chair": (57, 'furniture') , + "boat": (9, 'vehicle') , + "hair drier": (79, 'indoor') , + "airplane": (5, 'vehicle') , + "pizza": (54, 'food') , + "backpack": (25, 'accessory') , + "kite": (34, 'sports') , + "sheep": (19, 'animal') , + "umbrella": (26, 'accessory') , + "stop sign": (12, 'outdoor') , + "truck": (8, 'vehicle') , + "skis": (31, 'sports') , + "sandwich": (49, 'food') , + "broccoli": (51, 'food') , + "wine glass": (41, 'kitchen') , + "surfboard": (38, 'sports') , + "sports ball": (33, 'sports') , + "cell phone": (68, 'electronic') , + "dog": (17, 'animal') , + "bed": (60, 'furniture') , + "toilet": (62, 'furniture') , + "fire hydrant": (11, 'outdoor') , + "oven": (70, 'appliance') , + "zebra": (23, 'animal') , + "tv": (63, 'electronic') , + "potted plant": (59, 'furniture') , + "parking meter": (13, 'outdoor') , + "spoon": (45, 'kitchen') , + "bus": (6, 'vehicle') , + "laptop": (64, 'electronic') , + "cup": (42, 'kitchen') , + "bird": (15, 'animal') , + "sink": (72, 'appliance') , + "remote": (66, 'electronic') , + "bicycle": (2, 'vehicle') , + "tennis racket": (39, 'sports') , + "baseball bat": (35, 'sports') , + "cat": (16, 'animal') , + "fork": (43, 'kitchen') , + "suitcase": (29, 'accessory') , + "snowboard": (32, 'sports') , + "clock": (75, 'indoor') , + "apple": (48, 'food') , + "mouse": (65, 'electronic') , + "bottle": (40, 'kitchen') , + "frisbee": (30, 'sports') , + "carrot": (52, 'food') , + "bear": (22, 'animal') , + "hot dog": (53, 'food') , + "teddy bear": (78, 'indoor') , + "knife": (44, 'kitchen') , + "train": (7, 'vehicle') , + "vase": (76, 'indoor') , + "banana": (47, 'food') , + "motorcycle": (4, 'vehicle') , + "orange": (50, 'food') + } + +# use dataset_inspect.py to get these summary +data_splits_num = { + 'train': 22136, + 'val': 4952, +} + +def slim_get_batch(num_classes, batch_size, split_name, file_pattern, num_readers, 
                   num_preprocessing_threads, image_preprocessing_fn, anchor_encoder, num_epochs=None, is_training=True):
+    """Gets a dataset tuple with instructions for reading the Pascal VOC dataset.
+
+    Args:
+      num_classes: total number of classes in the dataset.
+      batch_size: the size of each batch.
+      split_name: 'train' or 'val'.
+      file_pattern: The file pattern to use when matching the dataset sources (full path).
+      num_readers: the maximum number of readers used for reading tfrecords.
+      num_preprocessing_threads: the maximum number of threads used to run the preprocessing function.
+      image_preprocessing_fn: the function used for dataset augmentation.
+      anchor_encoder: the function used to encode all anchors.
+      num_epochs: total number of epochs to iterate over this dataset.
+      is_training: whether we are in the training phase.
+
+    Returns:
+      A batch of [image, shape, loc_targets, cls_targets, match_scores].
+    """
+    if split_name not in data_splits_num:
+        raise ValueError('split name %s was not recognized.' % split_name)
+
+    # Features in Pascal VOC TFRecords.
+    keys_to_features = {
+        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
+        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/height': tf.FixedLenFeature([1], tf.int64),
+        'image/width': tf.FixedLenFeature([1], tf.int64),
+        'image/channels': tf.FixedLenFeature([1], tf.int64),
+        'image/shape': tf.FixedLenFeature([3], tf.int64),
+        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
+        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
+        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
+    }
+    items_to_handlers = {
+        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
+        'filename': slim.tfexample_decoder.Tensor('image/filename'),
+        'shape': slim.tfexample_decoder.Tensor('image/shape'),
+        'object/bbox': slim.tfexample_decoder.BoundingBox(
+            ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
+        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
+        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
+        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
+    }
+    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)
+
+    labels_to_names = {}
+    for name, pair in VOC_LABELS.items():
+        labels_to_names[pair[0]] = name
+
+    dataset = slim.dataset.Dataset(
+        data_sources=file_pattern,
+        reader=tf.TFRecordReader,
+        decoder=decoder,
+        num_samples=data_splits_num[split_name],
+        items_to_descriptions=None,
+        num_classes=num_classes,
+        labels_to_names=labels_to_names)
+
+    with tf.name_scope('dataset_data_provider'):
+        provider = slim.dataset_data_provider.DatasetDataProvider(
+            dataset,
+            num_readers=num_readers,
+            common_queue_capacity=32 * batch_size,
+            common_queue_min=8 * batch_size,
+            shuffle=is_training,
+            num_epochs=num_epochs)
+
+    [org_image, filename, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(['image', 'filename', 'shape',
+                                                                                        'object/label',
+                                                                                        'object/bbox',
+                                                                                        'object/difficult'])
+
+    if is_training:
+        # if all boxes are difficult, keep the first one
+        isdifficult_mask = tf.cond(tf.count_nonzero(isdifficult, dtype=tf.int32) <
tf.shape(isdifficult)[0], + lambda : isdifficult < tf.ones_like(isdifficult), + lambda : tf.one_hot(0, tf.shape(isdifficult)[0], on_value=True, off_value=False, dtype=tf.bool)) + + glabels_raw = tf.boolean_mask(glabels_raw, isdifficult_mask) + gbboxes_raw = tf.boolean_mask(gbboxes_raw, isdifficult_mask) + + # Pre-processing image, labels and bboxes. + + if is_training: + image, glabels, gbboxes = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw) + else: + image = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw) + glabels, gbboxes = glabels_raw, gbboxes_raw + + gt_targets, gt_labels, gt_scores = anchor_encoder(glabels, gbboxes) + + return tf.train.batch([image, filename, shape, gt_targets, gt_labels, gt_scores], + dynamic_pad=False, + batch_size=batch_size, + allow_smaller_final_batch=(not is_training), + num_threads=num_preprocessing_threads, + capacity=64 * batch_size) diff --git a/cv/detection/ssd/tensorflow/dataset/dataset_inspect.py b/cv/detection/ssd/tensorflow/dataset/dataset_inspect.py new file mode 100644 index 0000000000000000000000000000000000000000..a94e6a6880ff4f92970e4832bad51e18c870f1fe --- /dev/null +++ b/cv/detection/ssd/tensorflow/dataset/dataset_inspect.py @@ -0,0 +1,35 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import tensorflow as tf + +def count_split_examples(split_path, file_prefix='.tfrecord'): + # Count the total number of examples in all of these shard + num_samples = 0 + tfrecords_to_count = tf.gfile.Glob(os.path.join(split_path, file_prefix)) + opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB) + for tfrecord_file in tfrecords_to_count: + for record in tf.python_io.tf_record_iterator(tfrecord_file):#, options = opts): + num_samples += 1 + return num_samples + +if __name__ == '__main__': + print('train:', count_split_examples('/media/rs/7A0EE8880EE83EAF/Detections/SSD/dataset/tfrecords', 'train-?????-of-?????')) + print('val:', count_split_examples('/media/rs/7A0EE8880EE83EAF/Detections/SSD/dataset/tfrecords', 'val-?????-of-?????')) diff --git a/cv/detection/ssd/tensorflow/demo/demo1.jpg b/cv/detection/ssd/tensorflow/demo/demo1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e0ca8c5edf9d87e70894f369c16c76c5e22e2752 Binary files /dev/null and b/cv/detection/ssd/tensorflow/demo/demo1.jpg differ diff --git a/cv/detection/ssd/tensorflow/demo/demo2.jpg b/cv/detection/ssd/tensorflow/demo/demo2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..568105fe8152e73710f3ae3e90deaca8a47fc60c Binary files /dev/null and b/cv/detection/ssd/tensorflow/demo/demo2.jpg differ diff --git a/cv/detection/ssd/tensorflow/demo/demo3.jpg b/cv/detection/ssd/tensorflow/demo/demo3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d486a47fde1a9ba826f3db372f3589d61bd20567 Binary files /dev/null and b/cv/detection/ssd/tensorflow/demo/demo3.jpg differ diff --git a/cv/detection/ssd/tensorflow/eval_ssd.py b/cv/detection/ssd/tensorflow/eval_ssd.py new file mode 100644 index 0000000000000000000000000000000000000000..1a064b8a773d65bdcad27d6b629f1e09994ac25c --- /dev/null +++ b/cv/detection/ssd/tensorflow/eval_ssd.py @@ -0,0 +1,457 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow as tf + +import numpy as np + +from net import ssd_net + +from dataset import dataset_common +from preprocessing import ssd_preprocessing +from utility import anchor_manipulator +from utility import scaffolds + +# hardware related configuration +tf.app.flags.DEFINE_integer( + 'num_readers', 8, + 'The number of parallel readers that read data from the dataset.') +tf.app.flags.DEFINE_integer( + 'num_preprocessing_threads', 24, + 'The number of threads used to create the batches.') +tf.app.flags.DEFINE_integer( + 'num_cpu_threads', 0, + 'The number of cpu cores used to train.') +tf.app.flags.DEFINE_float( + 'gpu_memory_fraction', 1., 'GPU memory fraction to use.') +# scaffold related configuration +tf.app.flags.DEFINE_string( + 'data_dir', './dataset/tfrecords', + 'The directory where the dataset input data is stored.') +tf.app.flags.DEFINE_integer( + 'num_classes', 21, 'Number of classes to use in the dataset.') +tf.app.flags.DEFINE_string( + 'model_dir', './logs/', + 'The directory where the model will be stored.') +tf.app.flags.DEFINE_integer( + 'log_every_n_steps', 10, + 'The frequency with which logs are printed.') +tf.app.flags.DEFINE_integer( + 'save_summary_steps', 500, + 'The frequency with which summaries are saved, in seconds.') +# model related configuration +tf.app.flags.DEFINE_integer( + 'train_image_size', 300, + 'The size of the input image for the model to use.') +tf.app.flags.DEFINE_integer( + 'train_epochs', 1, + 'The number of epochs to use for training.') +tf.app.flags.DEFINE_integer( + 'batch_size', 1, + 'Batch size for training and evaluation.') +tf.app.flags.DEFINE_string( + 'data_format', 'channels_last', # 'channels_first' or 'channels_last' + 'A flag to override the data format used in the model. channels_first ' + 'provides a performance boost on GPU but is not always compatible ' + 'with CPU. 
If left unspecified, the data format will be chosen ' + 'automatically based on whether TensorFlow was built for CPU or GPU.') +tf.app.flags.DEFINE_float( + 'negative_ratio', 3., 'Negative ratio in the loss function.') +tf.app.flags.DEFINE_float( + 'match_threshold', 0.5, 'Matching threshold in the loss function.') +tf.app.flags.DEFINE_float( + 'neg_threshold', 0.5, 'Matching threshold for the negtive examples in the loss function.') +tf.app.flags.DEFINE_float( + 'select_threshold', 0.01, 'Class-specific confidence score threshold for selecting a box.') +tf.app.flags.DEFINE_float( + 'min_size', 0.03, 'The min size of bboxes to keep.') +tf.app.flags.DEFINE_float( + 'nms_threshold', 0.45, 'Matching threshold in NMS algorithm.') +tf.app.flags.DEFINE_integer( + 'nms_topk', 200, 'Number of total object to keep after NMS.') +tf.app.flags.DEFINE_integer( + 'keep_topk', 400, 'Number of total object to keep for each image before nms.') +# optimizer related configuration +tf.app.flags.DEFINE_float( + 'weight_decay', 5e-4, 'The weight decay on the model weights.') +# checkpoint related configuration +tf.app.flags.DEFINE_string( + 'checkpoint_path', './model', + 'The path to a checkpoint from which to fine-tune.') +tf.app.flags.DEFINE_string( + 'model_scope', 'ssd300', + 'Model scope name used to replace the name_scope in checkpoint.') + +FLAGS = tf.app.flags.FLAGS +#CUDA_VISIBLE_DEVICES + +def get_checkpoint(): + if tf.train.latest_checkpoint(FLAGS.model_dir): + tf.logging.info('Ignoring --checkpoint_path because a checkpoint already exists in %s' % FLAGS.model_dir) + return None + + if tf.gfile.IsDirectory(FLAGS.checkpoint_path): + checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) + else: + checkpoint_path = FLAGS.checkpoint_path + + return checkpoint_path + +# couldn't find better way to pass params from input_fn to model_fn +# some tensors used by model_fn must be created in input_fn to ensure they are in the same graph +# but when we put these tensors to labels's dict, the replicate_model_fn will split them into each GPU +# the problem is that they shouldn't be splited +global_anchor_info = dict() + +def input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=FLAGS.batch_size): + def input_fn(): + out_shape = [FLAGS.train_image_size] * 2 + anchor_creator = anchor_manipulator.AnchorCreator(out_shape, + layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], + anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)], + extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)], + anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)], + #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)], + layer_steps = [8, 16, 32, 64, 100, 300]) + all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors() + + num_anchors_per_layer = [] + for ind in range(len(all_anchors)): + num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]) + + anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6, + positive_threshold = FLAGS.match_threshold, + ignore_threshold = FLAGS.neg_threshold, + prior_scaling=[0.1, 0.1, 0.2, 0.2]) + + image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, 
data_format=FLAGS.data_format, output_rgb=False) + anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial) + + image, filename, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes, + batch_size, + ('train' if is_training else 'val'), + os.path.join(FLAGS.data_dir, dataset_pattern), + FLAGS.num_readers, + FLAGS.num_preprocessing_threads, + image_preprocessing_fn, + anchor_encoder_fn, + num_epochs=FLAGS.train_epochs, + is_training=is_training) + global global_anchor_info + global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer), + 'num_anchors_per_layer': num_anchors_per_layer, + 'all_num_anchors_depth': all_num_anchors_depth } + + return {'image': image, 'filename': filename, 'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}, None + return input_fn + +def modified_smooth_l1(bbox_pred, bbox_targets, bbox_inside_weights=1., bbox_outside_weights=1., sigma=1.): + """ + ResultLoss = outside_weights * SmoothL1(inside_weights * (bbox_pred - bbox_targets)) + SmoothL1(x) = 0.5 * (sigma * x)^2, if |x| < 1 / sigma^2 + |x| - 0.5 / sigma^2, otherwise + """ + with tf.name_scope('smooth_l1', [bbox_pred, bbox_targets]): + sigma2 = sigma * sigma + + inside_mul = tf.multiply(bbox_inside_weights, tf.subtract(bbox_pred, bbox_targets)) + + smooth_l1_sign = tf.cast(tf.less(tf.abs(inside_mul), 1.0 / sigma2), tf.float32) + smooth_l1_option1 = tf.multiply(tf.multiply(inside_mul, inside_mul), 0.5 * sigma2) + smooth_l1_option2 = tf.subtract(tf.abs(inside_mul), 0.5 / sigma2) + smooth_l1_result = tf.add(tf.multiply(smooth_l1_option1, smooth_l1_sign), + tf.multiply(smooth_l1_option2, tf.abs(tf.subtract(smooth_l1_sign, 1.0)))) + + outside_mul = tf.multiply(bbox_outside_weights, smooth_l1_result) + + return outside_mul + +def select_bboxes(scores_pred, bboxes_pred, num_classes, select_threshold): + selected_bboxes = {} + selected_scores = {} + with tf.name_scope('select_bboxes', [scores_pred, bboxes_pred]): + for class_ind in range(1, num_classes): + class_scores = scores_pred[:, class_ind] + select_mask = class_scores > select_threshold + + select_mask = tf.cast(select_mask, tf.float32) + selected_bboxes[class_ind] = tf.multiply(bboxes_pred, tf.expand_dims(select_mask, axis=-1)) + selected_scores[class_ind] = tf.multiply(class_scores, select_mask) + + return selected_bboxes, selected_scores + +def clip_bboxes(ymin, xmin, ymax, xmax, name): + with tf.name_scope(name, 'clip_bboxes', [ymin, xmin, ymax, xmax]): + ymin = tf.maximum(ymin, 0.) + xmin = tf.maximum(xmin, 0.) + ymax = tf.minimum(ymax, 1.) + xmax = tf.minimum(xmax, 1.) 
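+        # a box lying entirely outside the image can end up with min > max after
+        # clipping; the clamps below force ymin <= ymax and xmin <= xmax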
+ + ymin = tf.minimum(ymin, ymax) + xmin = tf.minimum(xmin, xmax) + + return ymin, xmin, ymax, xmax + +def filter_bboxes(scores_pred, ymin, xmin, ymax, xmax, min_size, name): + with tf.name_scope(name, 'filter_bboxes', [scores_pred, ymin, xmin, ymax, xmax]): + width = xmax - xmin + height = ymax - ymin + + filter_mask = tf.logical_and(width > min_size, height > min_size) + + filter_mask = tf.cast(filter_mask, tf.float32) + return tf.multiply(ymin, filter_mask), tf.multiply(xmin, filter_mask), \ + tf.multiply(ymax, filter_mask), tf.multiply(xmax, filter_mask), tf.multiply(scores_pred, filter_mask) + +def sort_bboxes(scores_pred, ymin, xmin, ymax, xmax, keep_topk, name): + with tf.name_scope(name, 'sort_bboxes', [scores_pred, ymin, xmin, ymax, xmax]): + cur_bboxes = tf.shape(scores_pred)[0] + scores, idxes = tf.nn.top_k(scores_pred, k=tf.minimum(keep_topk, cur_bboxes), sorted=True) + + ymin, xmin, ymax, xmax = tf.gather(ymin, idxes), tf.gather(xmin, idxes), tf.gather(ymax, idxes), tf.gather(xmax, idxes) + + paddings_scores = tf.expand_dims(tf.stack([0, tf.maximum(keep_topk-cur_bboxes, 0)], axis=0), axis=0) + + return tf.pad(ymin, paddings_scores, "CONSTANT"), tf.pad(xmin, paddings_scores, "CONSTANT"),\ + tf.pad(ymax, paddings_scores, "CONSTANT"), tf.pad(xmax, paddings_scores, "CONSTANT"),\ + tf.pad(scores, paddings_scores, "CONSTANT") + +def nms_bboxes(scores_pred, bboxes_pred, nms_topk, nms_threshold, name): + with tf.name_scope(name, 'nms_bboxes', [scores_pred, bboxes_pred]): + idxes = tf.image.non_max_suppression(bboxes_pred, scores_pred, nms_topk, nms_threshold) + return tf.gather(scores_pred, idxes), tf.gather(bboxes_pred, idxes) + +def parse_by_class(cls_pred, bboxes_pred, num_classes, select_threshold, min_size, keep_topk, nms_topk, nms_threshold): + with tf.name_scope('select_bboxes', [cls_pred, bboxes_pred]): + scores_pred = tf.nn.softmax(cls_pred) + selected_bboxes, selected_scores = select_bboxes(scores_pred, bboxes_pred, num_classes, select_threshold) + for class_ind in range(1, num_classes): + ymin, xmin, ymax, xmax = tf.unstack(selected_bboxes[class_ind], 4, axis=-1) + #ymin, xmin, ymax, xmax = tf.split(selected_bboxes[class_ind], 4, axis=-1) + #ymin, xmin, ymax, xmax = tf.squeeze(ymin), tf.squeeze(xmin), tf.squeeze(ymax), tf.squeeze(xmax) + ymin, xmin, ymax, xmax = clip_bboxes(ymin, xmin, ymax, xmax, 'clip_bboxes_{}'.format(class_ind)) + ymin, xmin, ymax, xmax, selected_scores[class_ind] = filter_bboxes(selected_scores[class_ind], + ymin, xmin, ymax, xmax, min_size, 'filter_bboxes_{}'.format(class_ind)) + ymin, xmin, ymax, xmax, selected_scores[class_ind] = sort_bboxes(selected_scores[class_ind], + ymin, xmin, ymax, xmax, keep_topk, 'sort_bboxes_{}'.format(class_ind)) + selected_bboxes[class_ind] = tf.stack([ymin, xmin, ymax, xmax], axis=-1) + selected_scores[class_ind], selected_bboxes[class_ind] = nms_bboxes(selected_scores[class_ind], selected_bboxes[class_ind], nms_topk, nms_threshold, 'nms_bboxes_{}'.format(class_ind)) + + return selected_bboxes, selected_scores + +def ssd_model_fn(features, labels, mode, params): + """model_fn for SSD to be used with our Estimator.""" + filename = features['filename'] + shape = features['shape'] + loc_targets = features['loc_targets'] + cls_targets = features['cls_targets'] + match_scores = features['match_scores'] + features = features['image'] + + global global_anchor_info + decode_fn = global_anchor_info['decode_fn'] + num_anchors_per_layer = global_anchor_info['num_anchors_per_layer'] + all_num_anchors_depth = 
global_anchor_info['all_num_anchors_depth'] + + with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE): + backbone = ssd_net.VGG16Backbone(params['data_format']) + feature_layers = backbone.forward(features, training=(mode == tf.estimator.ModeKeys.TRAIN)) + #print(feature_layers) + location_pred, cls_pred = ssd_net.multibox_head(feature_layers, params['num_classes'], all_num_anchors_depth, data_format=params['data_format']) + if params['data_format'] == 'channels_first': + cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred] + location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred] + + cls_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, params['num_classes']]) for pred in cls_pred] + location_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, 4]) for pred in location_pred] + + cls_pred = tf.concat(cls_pred, axis=1) + location_pred = tf.concat(location_pred, axis=1) + + cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']]) + location_pred = tf.reshape(location_pred, [-1, 4]) + + with tf.device('/cpu:0'): + bboxes_pred = decode_fn(location_pred) + bboxes_pred = tf.concat(bboxes_pred, axis=0) + selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred, + params['num_classes'], params['select_threshold'], params['min_size'], + params['keep_topk'], params['nms_topk'], params['nms_threshold']) + + predictions = {'filename': filename, 'shape': shape } + for class_ind in range(1, params['num_classes']): + predictions['scores_{}'.format(class_ind)] = tf.expand_dims(selected_scores[class_ind], axis=0) + predictions['bboxes_{}'.format(class_ind)] = tf.expand_dims(selected_bboxes[class_ind], axis=0) + + flaten_cls_targets = tf.reshape(cls_targets, [-1]) + flaten_match_scores = tf.reshape(match_scores, [-1]) + flaten_loc_targets = tf.reshape(loc_targets, [-1, 4]) + + # each positive examples has one label + positive_mask = flaten_cls_targets > 0 + n_positives = tf.count_nonzero(positive_mask) + + batch_n_positives = tf.count_nonzero(cls_targets, -1) + + batch_negtive_mask = tf.equal(cls_targets, 0)#tf.logical_and(tf.equal(cls_targets, 0), match_scores > 0.) + batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1) + + batch_n_neg_select = tf.cast(params['negative_ratio'] * tf.cast(batch_n_positives, tf.float32), tf.int32) + batch_n_neg_select = tf.minimum(batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32)) + + # hard negative mining for classification + predictions_for_bg = tf.nn.softmax(tf.reshape(cls_pred, [tf.shape(features)[0], -1, params['num_classes']]))[:, :, 0] + prob_for_negtives = tf.where(batch_negtive_mask, + 0. - predictions_for_bg, + # ignore all the positives + 0. 
- tf.ones_like(predictions_for_bg)) + topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives, k=tf.shape(prob_for_negtives)[1]) + score_at_k = tf.gather_nd(topk_prob_for_bg, tf.stack([tf.range(tf.shape(features)[0]), batch_n_neg_select - 1], axis=-1)) + + selected_neg_mask = prob_for_negtives >= tf.expand_dims(score_at_k, axis=-1) + + # include both selected negtive and all positive examples + final_mask = tf.stop_gradient(tf.logical_or(tf.reshape(tf.logical_and(batch_negtive_mask, selected_neg_mask), [-1]), positive_mask)) + total_examples = tf.count_nonzero(final_mask) + + cls_pred = tf.boolean_mask(cls_pred, final_mask) + location_pred = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask)) + flaten_cls_targets = tf.boolean_mask(tf.clip_by_value(flaten_cls_targets, 0, params['num_classes']), final_mask) + flaten_loc_targets = tf.stop_gradient(tf.boolean_mask(flaten_loc_targets, positive_mask)) + + # Calculate loss, which includes softmax cross entropy and L2 regularization. + #cross_entropy = (params['negative_ratio'] + 1.) * tf.cond(n_positives > 0, lambda: tf.losses.sparse_softmax_cross_entropy(labels=glabels, logits=cls_pred), lambda: 0.) + cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=flaten_cls_targets, logits=cls_pred) * (params['negative_ratio'] + 1.) + # Create a tensor named cross_entropy for logging purposes. + tf.identity(cross_entropy, name='cross_entropy_loss') + tf.summary.scalar('cross_entropy_loss', cross_entropy) + + #loc_loss = tf.cond(n_positives > 0, lambda: modified_smooth_l1(location_pred, tf.stop_gradient(flaten_loc_targets), sigma=1.), lambda: tf.zeros_like(location_pred)) + loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.) + loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1), name='location_loss') + tf.summary.scalar('location_loss', loc_loss) + tf.losses.add_loss(loc_loss) + + # Add weight decay to the loss. We exclude the batch norm variables because + # doing so leads to a small improvement in accuracy. + total_loss = tf.add(cross_entropy, loc_loss, name='total_loss') + + cls_accuracy = tf.metrics.accuracy(flaten_cls_targets, tf.argmax(cls_pred, axis=-1)) + + # Create a tensor named train_accuracy for logging purposes. + tf.identity(cls_accuracy[1], name='cls_accuracy') + tf.summary.scalar('cls_accuracy', cls_accuracy[1]) + + summary_hook = tf.train.SummarySaverHook(save_steps=params['save_summary_steps'], + output_dir=params['summary_dir'], + summary_op=tf.summary.merge_all()) + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + prediction_hooks=[summary_hook], + loss=None, train_op=None) + else: + raise ValueError('This script only support "PREDICT" mode!') + +def parse_comma_list(args): + return [float(s.strip()) for s in args.split(',')] + +def main(_): + # Using the Winograd non-fused algorithms provides a small performance boost. + os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + + gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction) + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, intra_op_parallelism_threads=FLAGS.num_cpu_threads, inter_op_parallelism_threads=FLAGS.num_cpu_threads, gpu_options=gpu_options) + + # Set up a RunConfig to only save checkpoints once per training cycle. 
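+    # (this script only runs prediction: both save_checkpoints_secs and
+    # save_checkpoints_steps are set to None below, so no checkpoints are written)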
+ run_config = tf.estimator.RunConfig().replace( + save_checkpoints_secs=None).replace( + save_checkpoints_steps=None).replace( + save_summary_steps=FLAGS.save_summary_steps).replace( + keep_checkpoint_max=5).replace( + log_step_count_steps=FLAGS.log_every_n_steps).replace( + session_config=config) + + summary_dir = os.path.join(FLAGS.model_dir, 'predict') + + ssd_detector = tf.estimator.Estimator( + model_fn=ssd_model_fn, model_dir=FLAGS.model_dir, config=run_config, + params={ + 'select_threshold': FLAGS.select_threshold, + 'min_size': FLAGS.min_size, + 'nms_threshold': FLAGS.nms_threshold, + 'nms_topk': FLAGS.nms_topk, + 'keep_topk': FLAGS.keep_topk, + 'data_format': FLAGS.data_format, + 'batch_size': FLAGS.batch_size, + 'model_scope': FLAGS.model_scope, + 'save_summary_steps': FLAGS.save_summary_steps, + 'summary_dir': summary_dir, + 'num_classes': FLAGS.num_classes, + 'negative_ratio': FLAGS.negative_ratio, + 'match_threshold': FLAGS.match_threshold, + 'neg_threshold': FLAGS.neg_threshold, + 'weight_decay': FLAGS.weight_decay, + }) + tensors_to_log = { + 'ce': 'cross_entropy_loss', + 'loc': 'location_loss', + 'loss': 'total_loss', + 'acc': 'cls_accuracy', + } + logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps, + formatter=lambda dicts: (', '.join(['%s=%.6f' % (k, v) for k, v in dicts.items()]))) + + print('Starting a predict cycle.') + pred_results = ssd_detector.predict(input_fn=input_pipeline(dataset_pattern='val-*', is_training=False, batch_size=FLAGS.batch_size), + hooks=[logging_hook], checkpoint_path=get_checkpoint())#, yield_single_examples=False) + + det_results = list(pred_results) + #print(list(det_results)) + + #[{'bboxes_1': array([[0. , 0. , 0.28459054, 0.5679505 ], [0.3158835 , 0.34792888, 0.7312541 , 1. ]], dtype=float32), 'scores_17': array([0.01333667, 0.01152573], dtype=float32), 'filename': b'000703.jpg', 'shape': array([334, 500, 3])}] + for class_ind in range(1, FLAGS.num_classes): + with open(os.path.join(summary_dir, 'results_{}.txt'.format(class_ind)), 'wt') as f: + for image_ind, pred in enumerate(det_results): + filename = pred['filename'] + shape = pred['shape'] + scores = pred['scores_{}'.format(class_ind)] + bboxes = pred['bboxes_{}'.format(class_ind)] + bboxes[:, 0] = (bboxes[:, 0] * shape[0]).astype(np.int32, copy=False) + 1 + bboxes[:, 1] = (bboxes[:, 1] * shape[1]).astype(np.int32, copy=False) + 1 + bboxes[:, 2] = (bboxes[:, 2] * shape[0]).astype(np.int32, copy=False) + 1 + bboxes[:, 3] = (bboxes[:, 3] * shape[1]).astype(np.int32, copy=False) + 1 + + valid_mask = np.logical_and((bboxes[:, 2] - bboxes[:, 0] > 0), (bboxes[:, 3] - bboxes[:, 1] > 0)) + + for det_ind in range(valid_mask.shape[0]): + if not valid_mask[det_ind]: + continue + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. + format(filename.decode('utf8')[:-4], scores[det_ind], + bboxes[det_ind, 1], bboxes[det_ind, 0], + bboxes[det_ind, 3], bboxes[det_ind, 2])) + + +if __name__ == '__main__': + tf.logging.set_verbosity(tf.logging.INFO) + tf.app.run() diff --git a/cv/detection/ssd/tensorflow/net/common_ops.py b/cv/detection/ssd/tensorflow/net/common_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..06eed648e66e0e6d0895d4ad5ea33cafba91400e --- /dev/null +++ b/cv/detection/ssd/tensorflow/net/common_ops.py @@ -0,0 +1,471 @@ +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +import math +import numpy as np +import tensorflow.compat.v1 as tf +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + + +def fixed_padding(inputs, kernel_size, rate=1): + kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), + kernel_size[1] + (kernel_size[1] - 1) * (rate - 1)] + + pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] + pad_beg = [pad_total[0] // 2, pad_total[1] // 2] + pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] + + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], + [pad_beg[1], pad_end[1]], [0, 0]]) + return padded_inputs + + +def conv2d( + inputs, + filters, + kernel_size=3, + stride=1, + padding='SAME', + use_bias=True, + kernel_initializer=tf.keras.initializers.VarianceScaling( + scale=2.0, mode='fan_in', distribution='truncated_normal'), + bias_initializer=tf.constant_initializer(0.0), + dilation_rate=1, + scope=None): + with tf.variable_scope(scope, default_name='Conv2D') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + if type(kernel_size).__name__ == 'int': + kernel_shape = [kernel_size, kernel_size, in_channel, filters] + else: + kernel_shape = kernel_size + [in_channel, filters] + + kernel = tf.get_variable( + 'kernel', shape=kernel_shape, dtype=tf.float32, + initializer=kernel_initializer, trainable=True) + + if padding.lower() == 'same': + inputs = fixed_padding(inputs, kernel_size=kernel_shape[0:2], rate=dilation_rate) + + if dilation_rate > 1: + outputs = tf.nn.atrous_conv2d(inputs, kernel, rate=dilation_rate, padding='VALID') + else: + strides = [1, stride, stride, 1] + outputs = tf.nn.conv2d( + inputs, kernel, strides=strides, padding='VALID', + use_cudnn_on_gpu=True, name='convolution') + if use_bias: + b = tf.get_variable( + 'bias', shape=[filters], dtype=tf.float32, + initializer=bias_initializer, trainable=True) + outputs = tf.nn.bias_add(outputs, b) + return outputs + + +def depthwise_conv2d( + inputs, + filters=None, + kernel_size=3, + stride=1, + padding='SAME', + depth_multiplier=1, + use_bias=False, + kernel_initializer=tf.keras.initializers.VarianceScaling( + scale=2.0, mode='fan_in', distribution='truncated_normal'), + bias_initializer=tf.constant_initializer(0.0), + dilation_rate=1, + scope=None): + with tf.variable_scope(scope, default_name='DepthwiseConv2D') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + if filters: + assert filters % in_channel == 0 + depth_multiplier = filters // in_channel + if type(kernel_size).__name__ == 'int': + kernel_shape = [kernel_size, kernel_size, in_channel, depth_multiplier] + else: + kernel_shape = kernel_size + [in_channel, depth_multiplier] + + kernel = tf.get_variable( + 'depthwise_kernel', shape=kernel_shape, dtype=tf.float32, + initializer=kernel_initializer, trainable=True) + + if padding.lower() 
== 'same': + inputs = fixed_padding(inputs, kernel_size=kernel_shape[0:2], rate=dilation_rate) + + if dilation_rate > 1: + strides = [1, 1, 1, 1] + outputs = tf.nn.depthwise_conv2d( + inputs, kernel, strides=strides, padding='VALID', rate=dilation_rate) + else: + strides = [1, stride, stride, 1] + # param filter of tf.nn.depthwise_conv2d: [filter_height, filter_width, in_channels, channel_multiplier] + outputs = tf.nn.depthwise_conv2d( + inputs, kernel, strides=strides, padding='VALID', rate=None) + if use_bias: + b = tf.get_variable( + 'depthwise_bias', shape=[in_channel], dtype=tf.float32, + initializer=bias_initializer, trainable=True) + outputs = tf.nn.bias_add(outputs, b) + return outputs + + +# Flatten the tensor except the first dimension. +def _batch_flatten(x): + shape = x.get_shape().as_list()[1:] + if None not in shape: + return tf.reshape(x, [-1, int(np.prod(shape))]) + return tf.reshape(x, tf.stack([tf.shape(x)[0], -1])) + + +def fully_connected( + inputs, + filters, + use_bias=True, + kernel_initializer=tf.keras.initializers.VarianceScaling( + scale=2.0, mode='fan_in', distribution='truncated_normal'), + bias_initializer=tf.constant_initializer(0.0), + scope=None): + with tf.variable_scope(scope, default_name='Conv2D') as s, \ + tf.name_scope(s.original_name_scope): + inputs = _batch_flatten(inputs) + in_channel = inputs.get_shape().as_list()[1] + kernel_shape = [in_channel, filters] + + kernel = tf.get_variable('kernel', shape=kernel_shape, dtype=tf.float32, initializer=kernel_initializer, trainable=True) + outputs = tf.matmul(inputs, kernel) + if use_bias: + b = tf.get_variable('bias', shape=[filters], dtype=tf.float32, initializer=bias_initializer, trainable=True) + outputs = tf.nn.bias_add(outputs, b) + return outputs + + +def get_normalizer_fn( + norm_name, + is_training, + bn_decay=0.99, + ema_update=True, + r_max=3, + d_max=5, + group=8): + + def normalizer_fn(inputs, scope=''): + if 'batch_norm' == norm_name.lower(): + if type(is_training) is tf.Tensor: + return tf.cond( + is_training, + lambda: batch_normalization(inputs=inputs, is_training=True, + bn_decay=bn_decay, ema_update=ema_update, scope=scope, reuse=None), + lambda: batch_normalization(inputs=inputs, is_training=False, + bn_decay=bn_decay, ema_update=ema_update, scope=scope, reuse=True) + ) + else: + return batch_normalization(inputs=inputs, is_training=is_training, + bn_decay=bn_decay, ema_update=ema_update, scope=scope, reuse=None) + elif 'batch_renorm' == norm_name.lower(): + if type(is_training) is tf.Tensor: + return tf.cond( + is_training, + lambda: batch_renormalization(inputs=inputs, is_training=True, + r_max=r_max, d_max=d_max, bn_decay=bn_decay, + ema_update=ema_update, scope=scope, reuse=None), + lambda: batch_renormalization(inputs=inputs, is_training=False, + r_max=r_max, d_max=d_max, bn_decay=bn_decay, + ema_update=ema_update, scope=scope, reuse=True) + ) + else: + return batch_renormalization(inputs=inputs, is_training=is_training, + r_max=r_max, d_max=d_max, bn_decay=bn_decay, + ema_update=ema_update, scope=scope, reuse=None) + elif 'group_norm' == norm_name.lower(): + if type(is_training) is tf.Tensor: + return group_norm(inputs, is_training=True, group=group, scope=scope) + else: + return group_norm(inputs, is_training=is_training, group=group, scope=scope) + elif 'instance_norm' == norm_name.lower(): + if type(is_training) is tf.Tensor: + return instance_norm(inputs, is_training=True, scope=scope) + else: + return instance_norm(inputs, is_training=is_training, scope=scope) + else: + return 
tf.identity(inputs) + + return normalizer_fn + + +def moments(x, axes, keep_dims=False, is_training=False, name=None): + with ops.name_scope(name, "moments", [x, axes]): + mean = math_ops.reduce_mean(x, axes, keepdims=True, name="mean") + if is_training: + squared_difference = math_ops.squared_difference(x, array_ops.stop_gradient(mean)) + else: + squared_difference = math_ops.squared_difference(x, mean) + variance = math_ops.reduce_mean(squared_difference, axes, keepdims=True, name="variance") + if not keep_dims: + mean = array_ops.squeeze(mean, axes) + variance = array_ops.squeeze(variance, axes) + return (mean, variance) + + +def batch_normalization( + inputs, + is_training, + epsilon=1e-5, + bn_decay=0.99, + ema_update=True, + scope=None, + reuse=None): + + if scope: + scope = scope + '/BatchNorm' + with tf.variable_scope(scope, default_name='BatchNorm', reuse=reuse) as s, \ + tf.name_scope(s.original_name_scope): + C = inputs.get_shape().as_list()[3] + # gamma: a trainable scale factor + gamma = tf.get_variable("gamma", [C], + initializer=tf.constant_initializer(1.0), trainable=True) + # beta: a trainable shift value + beta = tf.get_variable("beta", [C], + initializer=tf.constant_initializer(0.0), trainable=True) + moving_mean = tf.get_variable("moving_mean", [C], + initializer=tf.constant_initializer(0.0), trainable=False) + moving_variance = tf.get_variable("moving_variance", [C], + initializer=tf.constant_initializer(1.0), trainable=False) + # use batch statistics + if is_training: + mean, var = moments(inputs, [0,1,2], keep_dims=True, is_training=is_training) + mean = tf.reshape(mean, [C]) + var = tf.reshape(var, [C]) + # update moving_mean and moving_variance + if ema_update: + update_moving_mean = tf.assign( + moving_mean, moving_mean * bn_decay + mean * (1 - bn_decay)) + update_moving_variance = tf.assign( + moving_variance, moving_variance * bn_decay + var * (1 - bn_decay)) + control_inputs = [update_moving_mean, update_moving_variance] + else: + control_inputs = [] + with tf.control_dependencies(control_inputs): + output = tf.nn.batch_normalization( + inputs, mean, var, offset=beta, scale=gamma, variance_epsilon=epsilon) + # use EMA statistics + else: + output = tf.nn.batch_normalization( + inputs, moving_mean, moving_variance, offset=beta, scale=gamma, + variance_epsilon=epsilon) + + return output + + +def batch_renormalization( + inputs, + is_training, + r_max=3, + d_max=5, + epsilon=1e-5, + bn_decay=0.99, + ema_update=False, + scope=None, + reuse=None): + + if scope: + scope = scope + '/BatchNorm' + with tf.variable_scope(scope, default_name='BatchNorm', reuse=reuse) as s, \ + tf.name_scope(s.original_name_scope): + C = inputs.get_shape().as_list()[3] + # gamma: a trainable scale factor + gamma = tf.get_variable("gamma", [C], + initializer=tf.constant_initializer(1.0), trainable=True) + # beta: a trainable shift value + beta = tf.get_variable("beta", [C], + initializer=tf.constant_initializer(0.0), trainable=True) + moving_mean = tf.get_variable("moving_mean", [C], + initializer=tf.constant_initializer(0.0), trainable=False) + moving_variance = tf.get_variable("moving_variance", [C], + initializer=tf.constant_initializer(1.0), trainable=False) + # use batch statistics + if is_training: + mean, var = moments(inputs, [0,1,2], keep_dims=True, is_training=is_training) + mean = tf.reshape(mean, [C]) + var = tf.reshape(var, [C]) + std = math_ops.sqrt(var + epsilon) + + r = std / (math_ops.sqrt(moving_variance + epsilon)) + r = array_ops.stop_gradient(tf.clip_by_value(r, 1/r_max, 
r_max)) + + d = (mean - moving_mean) / math_ops.sqrt(moving_variance + epsilon) + d = array_ops.stop_gradient(tf.clip_by_value(d, -d_max, d_max)) + # update moving_mean and moving_variance + if ema_update: + update_moving_mean = tf.assign(moving_mean, moving_mean * bn_decay + mean * (1 - bn_decay)) + update_moving_variance = tf.assign(moving_variance, moving_variance * bn_decay + var * (1 - bn_decay)) + control_inputs = [update_moving_mean, update_moving_variance] + else: + control_inputs = [] + + batch_normed_output = (inputs - mean) / std + with tf.control_dependencies(control_inputs): + output = (batch_normed_output * r + d) * gamma + beta + # use EMA statistics + else: + output = tf.nn.batch_normalization( + inputs, moving_mean, moving_variance, offset=beta, scale=gamma, + variance_epsilon=epsilon) + + return output + + +def group_norm(inputs, group=8, epsilon=1e-5, is_training=False, scope=None): + if scope: + scope = scope + '/GroupNorm' + with tf.variable_scope(scope, default_name='GroupNorm') as s, \ + tf.name_scope(s.original_name_scope): + C = inputs.get_shape().as_list()[3] + orig_shape = tf.shape(inputs) + H, W = orig_shape[1], orig_shape[2] + G = min(group, C) + + x = tf.reshape(inputs, [-1, H, W, G, C//G]) + mean, var = moments(x, [1, 2, 4], keep_dims=True, is_training=is_training) + + gamma = tf.get_variable('gamma', shape=[C], dtype=tf.float32, + initializer=tf.constant_initializer(1.0), trainable=True) + beta = tf.get_variable('beta', shape=[C], dtype=tf.float32, + initializer=tf.constant_initializer(0.0), trainable=True) + gamma = tf.reshape(gamma, [1, 1, 1, G, C//G]) + beta = tf.reshape(beta, [1, 1, 1, G, C//G]) + + output = tf.nn.batch_normalization( + x, mean, var, offset=beta, scale=gamma, variance_epsilon=epsilon) + output = tf.reshape(output, orig_shape) + return output + + +def instance_norm(inputs, epsilon=1e-5, is_training=False, scope=None): + if scope: + scope = scope + '/InstanceNorm' + with tf.variable_scope(scope, default_name='InstanceNorm') as s, \ + tf.name_scope(s.original_name_scope): + B = tf.shape(inputs)[0] + C = inputs.get_shape().as_list()[-1] + + gamma = tf.get_variable('gamma', shape=[C], dtype=tf.float32, + initializer=tf.constant_initializer(1.0), trainable=True) + beta = tf.get_variable('beta', shape=[C], dtype=tf.float32, + initializer=tf.constant_initializer(0.0), trainable=True) + gamma = tf.reshape(gamma, [1, 1, 1, C]) + beta = tf.reshape(beta, [1, 1, 1, C]) + + mean, var = moments(inputs, [1, 2], keep_dims=True, is_training=is_training) + output = tf.nn.batch_normalization( + inputs, mean, var, offset=beta, scale=gamma, variance_epsilon=epsilon) + return output + + +def relu(x, name="relu"): + return tf.nn.relu(x, name=name) + + +def relu6(x, name="relu6"): + return tf.nn.relu6(x, name=name) + + +# x = tf.where(x < 0.0, leak * x, x) +def leaky_relu(x, leak=0.01, name="leaky_relu"): + return tf.nn.leaky_relu(x, alpha=leak, name=name) + + +# x = tf.where(x > 0.0, x, alpha * tf.exp(x) - alpha) +# alpha =1.0 by default +def elu(x, name='elu'): + return tf.nn.elu(x, name=name) + + +# alpha = 1.6732632423543772848170429916717 +# scale = 1.0507009873554804934193349852946 +# x = scale * tf.where(x > 0.0, x, alpha * tf.exp(x) - alpha) +def selu(x, name='selu'): + return tf.nn.selu(x) + + +def hard_swish(x): + return x * tf.nn.relu6(x + 3.0) / 6.0 + + +# x = tf.clip_by_value(x + 3.0, 0.0, 6.0) / 6.0 +def hard_sigmoid(x): + return tf.nn.relu6(x + 3.0) / 6.0 + + +def sigmoid(x): + return tf.nn.sigmoid(x) + + +def max_pooling(inputs, kernel_size=2, 
stride=2, padding='SAME', name='MaxPooling'): + if type(kernel_size).__name__ == 'int': + kernel_size = [1, kernel_size, kernel_size, 1] + else: + kernel_size = [1] + kernel_size + [1] + if type(stride).__name__ == 'int': + strides = [1, stride, stride, 1] + else: + strides = [1] + stride + [1] + return tf.nn.max_pool(inputs, ksize=kernel_size, strides=strides, padding=padding, data_format='NHWC', name=name) + + +def avg_pooling(inputs, kernel_size=2, stride=2, padding='SAME', name='AveragePooling'): + if type(kernel_size).__name__ == 'int': + kernel_size = [1, kernel_size, kernel_size, 1] + else: + kernel_size = [1] + kernel_size + [1] + if type(stride).__name__ == 'int': + strides = [1, stride, stride, 1] + else: + strides = [1] + stride + [1] + return tf.nn.avg_pool(inputs, ksize=kernel_size, strides=strides, padding=padding, data_format='NHWC', name=name) + + +def global_avg_pooling(inputs, keep_dims=True, name='GlobalAveragePooling'): + shape = inputs.get_shape().as_list() + if shape[1] is None or shape[2] is None: + output = math_ops.reduce_mean(inputs, [1,2], keepdims=True, name=name) + else: + kernel_size = [1, shape[1], shape[2], 1] + output = tf.nn.avg_pool(inputs, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID', name=name) + return output + + +def get_tensor_size(inputs): + shape = inputs.get_shape().as_list() + if shape[1] is None or shape[2] is None: + return tf.shape(inputs)[1:3] + else: + return shape[1:3] + + +def dropout(inputs, is_training, dropout_ratio=0.0, name='Dropout'): + if type(is_training) is tf.Tensor: + return tf.cond( + is_training, + lambda: tf.nn.dropout(inputs, dropout_ratio, name=name), + lambda: tf.identity(inputs, name=name) + ) + elif is_training: + return tf.nn.dropout(inputs, dropout_ratio, name=name) + else: + return tf.identity(inputs, name=name) diff --git a/cv/detection/ssd/tensorflow/net/resnet.py b/cv/detection/ssd/tensorflow/net/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..6e9b8d255c70f62f504f263e6d168fa63ad23f0c --- /dev/null +++ b/cv/detection/ssd/tensorflow/net/resnet.py @@ -0,0 +1,296 @@ +# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
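+#
+# ResNet v1 / v1d / v2 backbone factory: resnet() returns a forward(inputs) closure
+# that yields a list of intermediate feature maps, or classification logits when
+# num_classes is given.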
+ +from .common_ops import * + + +def resnet(is_training, num_classes=None, **kwargs): + + version = kwargs.get('version', 'v1') + size = kwargs.get('size', 50) + if version.lower() == 'v1d': + deepbase = True + avg_down = True + else: + deepbase = False + avg_down = False + dropout_ratio = kwargs.get('dropout_ratio', 0.0) + norm_name = kwargs.get('norm_name', 'batch_norm') + bn_decay = kwargs.get('bn_decay', 0.99) + r_max = kwargs.get('r_max', 3) + d_max = kwargs.get('d_max', 5) + use_se = kwargs.get('use_se', False) + se_reduction = kwargs.get('se_reduction', 16) + + stage_blocks = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[size] + output_channels = { + 18: [64, 128, 256, 512], + 34: [64, 128, 256, 512], + 50: [256, 512, 1024, 2048], + 101: [256, 512, 1024, 2048], + 152: [256, 512, 1024, 2048], + }[size] + num_stages = len(stage_blocks) + + norm_fn = get_normalizer_fn( + norm_name, + is_training=is_training, + bn_decay=0.99, + r_max=r_max, + d_max=d_max + ) + + if 'v1' in version: + net_name = 'ResNetV1' + if size < 50: + block_name = 'resblockv1' + else: + block_name = 'bottleneckv1' + elif 'v2' in version: + net_name = 'ResNetV2' + if size < 50: + block_name = 'resblockv2' + else: + block_name = 'bottleneckv2' + else: + raise NotImplementedError + + def conv_layer(inputs, filters, kernel_size=3, stride=1, padding='SAME', scope=None): + x = conv2d(inputs, filters, kernel_size=kernel_size, stride=stride, + use_bias=False, padding=padding, scope=scope) + x = norm_fn(x, scope=scope) + x = relu(x) + return x + + if use_se: + def squeeze_excite( + inputs, + scope=None): + with tf.variable_scope(scope, default_name='squeeze_excite') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + avgpool = global_avg_pooling(inputs, keep_dims=True, name="avgpool") + squeeze = conv2d(avgpool, in_channel//se_reduction, + kernel_size=1, stride=1, use_bias=True, scope='squeeze') + squeeze = relu(squeeze) + excite = conv2d(squeeze, in_channel, + kernel_size=1, stride=1, use_bias=True, scope='excite') + excite = sigmoid(excite) + return inputs * excite + else: + squeeze_excite = None + # A single block for ResNet v1. + if 'resblockv1' == block_name: + def block_fn(inputs, out_channel, kernel_size=3, stride=1, scope=None): + with tf.variable_scope(scope, default_name='ResidualBlockV1') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + if in_channel != out_channel: + if avg_down and stride != 1: + shortcut = avg_pooling(inputs, kernel_size=stride, stride=stride) + shortcut = conv2d(shortcut, out_channel, + kernel_size=1, stride=1, use_bias=False, + scope='projection_shortcut') + else: + shortcut = conv2d(inputs, out_channel, + kernel_size=1, stride=stride, use_bias=False, + scope='projection_shortcut') + shortcut = norm_fn(shortcut, scope='projection_shortcut') + else: + shortcut = inputs + x = conv_layer(inputs, out_channel, + kernel_size=kernel_size, stride=stride, scope='conv_1') + x = conv2d(x, out_channel, + kernel_size=kernel_size, stride=1, use_bias=False, scope='conv_2') + x = norm_fn(x, scope='conv_2') + if squeeze_excite: + x = squeeze_excite(x, scope='squeeze_excite') + + return relu(x + shortcut) + # A single block for ResNet v1, with a bottleneck. 
+ # Bottleneck places the stride for downsampling at 3x3 convolution(conv2) + # while original implementation places the stride at the first 1x1 convolution(conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + if 'bottleneckv1' == block_name: + def block_fn(inputs, out_channel, kernel_size=3, stride=1, reduction=4, scope=None): + with tf.variable_scope(scope, default_name='BottleneckBlockV1') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + if in_channel != out_channel: + if avg_down and stride != 1: + shortcut = avg_pooling(inputs, kernel_size=stride, stride=stride) + shortcut = conv2d(shortcut, out_channel, + kernel_size=1, stride=1, use_bias=False, + scope='projection_shortcut') + else: + shortcut = conv2d(inputs, out_channel, + kernel_size=1, stride=stride, use_bias=False, + scope='projection_shortcut') + shortcut = norm_fn(shortcut, scope='projection_shortcut') + else: + shortcut = inputs + x = conv_layer(inputs, out_channel//reduction, + kernel_size=1, stride=1, scope='reduction') + x = conv_layer(x, out_channel//reduction, + kernel_size=kernel_size, stride=stride, scope='bottleneck') + x = conv2d(x, out_channel, + kernel_size=1, stride=1, use_bias=False, scope='expansion') + x = norm_fn(x, scope='expansion') + if squeeze_excite: + x = squeeze_excite(x, scope='squeeze_excite') + return relu(x + shortcut) + # A single block for ResNet v2. + if 'resblockv2' == block_name: + def block_fn(inputs, out_channel, kernel_size=3, stride=1, scope=None): + with tf.variable_scope(scope, default_name='ResidualBlockV2') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + if in_channel != out_channel: + if avg_down and stride != 1: + shortcut = avg_pooling(inputs, kernel_size=stride, stride=stride) + shortcut = conv2d(shortcut, out_channel, + kernel_size=1, stride=1, use_bias=False, + scope='projection_shortcut') + else: + shortcut = conv2d(inputs, out_channel, + kernel_size=1, stride=stride, use_bias=False, + scope='projection_shortcut') + else: + shortcut = inputs + x = norm_fn(inputs, scope='norm_inputs') + x = relu(x) + x = conv_layer(x, out_channel, + kernel_size=kernel_size, stride=stride, scope='conv_1') + x = conv2d(x, out_channel, + kernel_size=kernel_size, stride=1, use_bias=False, scope='conv_2') + if squeeze_excite: + x = squeeze_excite(x, scope='squeeze_excite') + + return x + shortcut + # A single block for ResNet v2, with a bottleneck. 
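+    # (v2 blocks use pre-activation ordering: normalization and ReLU are applied
+    # before each convolution, following https://arxiv.org/abs/1603.05027)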
+ if 'bottleneckv2' == block_name: + def block_fn(inputs, out_channel, kernel_size=3, stride=1, reduction=4, scope=None): + with tf.variable_scope(scope, default_name='BottleneckBlockV2') as s, \ + tf.name_scope(s.original_name_scope): + in_channel = inputs.get_shape().as_list()[3] + if in_channel != out_channel: + if avg_down and stride != 1: + shortcut = avg_pooling(inputs, kernel_size=stride, stride=stride) + shortcut = conv2d(shortcut, out_channel, + kernel_size=1, stride=1, use_bias=False, + scope='projection_shortcut') + else: + shortcut = conv2d(inputs, out_channel, + kernel_size=1, stride=stride, use_bias=False, + scope='projection_shortcut') + else: + shortcut = inputs + x = norm_fn(inputs, scope='norm_inputs') + x = relu(x) + x = conv_layer(x, out_channel//reduction, + kernel_size=1, stride=1, scope='reduction') + x = conv_layer(x, out_channel//reduction, + kernel_size=kernel_size, stride=stride, scope='bottleneck') + x = conv2d(x, out_channel, + kernel_size=1, stride=1, use_bias=False, scope='expansion') + if squeeze_excite: + x = squeeze_excite(x, scope='squeeze_excite') + return x + shortcut + + def first_layer(inputs, filters=64, scope='first_layer'): + with tf.variable_scope(scope): + if deepbase: + x = conv_layer(inputs, filters//2, kernel_size=3, stride=2, scope='Conv_1') + x = conv_layer(x, filters//2, kernel_size=3, stride=1, scope='Conv_2') + if net_name == 'ResNetV2': + x = conv2d(x, filters, + kernel_size=3, stride=1, use_bias=False, scope='Conv_3') + else: + x = conv_layer(x, filters, kernel_size=3, stride=1, scope='Conv_3') + else: + if net_name == 'ResNetV2': + x = conv2d(inputs, filters, + kernel_size=7, stride=2, use_bias=False, scope='Conv_1') + else: + x = conv_layer(inputs, filters, kernel_size=7, stride=2, scope='Conv_1') + return x + + end_points = [] + + def forward(inputs): + + with tf.variable_scope(net_name, reuse=tf.AUTO_REUSE): + net = first_layer(inputs, scope='first_layer') + #end_points['down_1'] = net + end_points.append(net) + net = max_pooling(net, kernel_size=3, stride=2) + # stage 1,2,3,4 + for stage_idx in range(num_stages): + stage_scope = 'stage_{}'.format(stage_idx+1) + num_blocks = stage_blocks[stage_idx] + out_channel = output_channels[stage_idx] + with tf.variable_scope(stage_scope): + for block_idx in range(num_blocks): + block_scope = 'block_{}'.format(block_idx+1) + stride = 2 if (stage_idx > 0 and block_idx == 0) else 1 + net = block_fn(net, out_channel, + kernel_size=3, stride=stride, scope=block_scope) + #end_points['down_%d' %(stage_idx+2)] = net + end_points.append(net) + + if net_name == 'ResNetV2': + net = norm_fn(net, scope='postnorm') + net = relu(net) + #end_points['down_%d' %((num_stages-1)+2)] = net + end_points[-1] = net + + if not num_classes: + return end_points + + net = global_avg_pooling(net) + + if dropout_ratio: + net = dropout(net, is_training=is_training, dropout_ratio=dropout_ratio) + + logits = conv2d(net, num_classes, kernel_size=1, use_bias=True, scope='classification') + logits = tf.squeeze(logits, [1, 2], name='logits') + + return logits + + return forward + + +def resnet18(is_training, num_classes=None, **kwargs): + return resnet(is_training, num_classes, size=18, **kwargs) + + +def resnet34(is_training, num_classes=None, **kwargs): + return resnet(is_training, num_classes, size=34, **kwargs) + + +def resnet50(is_training, num_classes=None, **kwargs): + return resnet(is_training, num_classes, size=50, **kwargs) + + +def resnet101(is_training, num_classes=None, **kwargs): + return resnet(is_training, 
num_classes, size=101, **kwargs) diff --git a/cv/detection/ssd/tensorflow/net/ssd_net.py b/cv/detection/ssd/tensorflow/net/ssd_net.py new file mode 100644 index 0000000000000000000000000000000000000000..c584a7f283597b0490c9f6b9fd519e5b7afbcbe3 --- /dev/null +++ b/cv/detection/ssd/tensorflow/net/ssd_net.py @@ -0,0 +1,336 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from functools import partial +import tensorflow.compat.v1 as tf + +from .resnet import resnet18, resnet34, resnet50, resnet101 + +_BATCH_NORM_DECAY = 0.9 +_BATCH_NORM_EPSILON = 1e-5 +_USE_FUSED_BN = True + +# vgg_16/conv2/conv2_1/biases +# vgg_16/conv4/conv4_3/biases +# vgg_16/conv1/conv1_1/biases +# vgg_16/fc6/weights +# vgg_16/conv3/conv3_2/biases +# vgg_16/conv5/conv5_3/biases +# vgg_16/conv3/conv3_1/weights +# vgg_16/conv4/conv4_2/weights +# vgg_16/conv1/conv1_1/weights +# vgg_16/conv5/conv5_3/weights +# vgg_16/conv4/conv4_1/weights +# vgg_16/conv3/conv3_3/weights +# vgg_16/conv5/conv5_2/biases +# vgg_16/conv3/conv3_2/weights +# vgg_16/conv4/conv4_2/biases +# vgg_16/conv5/conv5_2/weights +# vgg_16/conv3/conv3_1/biases +# vgg_16/conv2/conv2_2/weights +# vgg_16/fc7/weights +# vgg_16/conv5/conv5_1/biases +# vgg_16/conv1/conv1_2/biases +# vgg_16/conv2/conv2_2/biases +# vgg_16/conv4/conv4_1/biases +# vgg_16/fc7/biases +# vgg_16/fc6/biases +# vgg_16/conv4/conv4_3/weights +# vgg_16/conv2/conv2_1/weights +# vgg_16/conv5/conv5_1/weights +# vgg_16/conv3/conv3_3/biases +# vgg_16/conv1/conv1_2/weights + +class ReLuLayer(tf.layers.Layer): + def __init__(self, name, **kwargs): + super(ReLuLayer, self).__init__(name=name, trainable=trainable, **kwargs) + self._name = name + def build(self, input_shape): + self._relu = lambda x : tf.nn.relu(x, name=self._name) + self.built = True + + def call(self, inputs): + return self._relu(inputs) + + def compute_output_shape(self, input_shape): + return tf.TensorShape(input_shape) + + +def forward_module(m, inputs, training=False): + if isinstance(m, tf.layers.BatchNormalization) or isinstance(m, tf.layers.Dropout): + return m.apply(inputs, training=training) + return m.apply(inputs) + + +def get_backbone(backbone, training, **kwargs): + forward_fn = _VGG_BACKBONES[backbone](training, **kwargs) + return forward_fn + + +def get_vgg16(training, **kwargs): + _backbone = VGG16Backbone(**kwargs) + return partial(_backbone.forward, training=training) + + +def get_resnet18(training, **kwargs): + _backbone = resnet18(training, **kwargs) + return _backbone + + +def get_resnet34(training, **kwargs): + _backbone = resnet34(training, **kwargs) + return _backbone + + +def get_resnet50(training, **kwargs): + _backbone = resnet50(training, **kwargs) + return _backbone + + +def get_resnet101(training, **kwargs): + _backbone = resnet101(training, **kwargs) + return _backbone + + +_VGG_BACKBONES = 
{ + 'vgg16': get_vgg16, + 'resnet18': get_resnet18, + 'resnet34': get_resnet34, + 'resnet50': get_resnet50, + 'resnet101': get_resnet101, +} + + +def ssd_conv_block( + filters, strides, name, + data_format, kernel_initializer, + padding='same', reuse=None): + + with tf.variable_scope(name): + conv_blocks = [] + conv_blocks.append( + tf.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding=padding, + data_format=data_format, activation=tf.nn.relu, use_bias=True, + kernel_initializer=kernel_initializer, + bias_initializer=tf.zeros_initializer(), + name='{}_1'.format(name), _scope='{}_1'.format(name), _reuse=None) + ) + conv_blocks.append( + tf.layers.Conv2D(filters=filters * 2, kernel_size=3, strides=strides, padding=padding, + data_format=data_format, activation=tf.nn.relu, use_bias=True, + kernel_initializer=kernel_initializer, + bias_initializer=tf.zeros_initializer(), + name='{}_2'.format(name), _scope='{}_2'.format(name), _reuse=None) + ) + return conv_blocks + + +class SSDBackbone(object): + def __init__(self, backbone, training, **kwargs): + self.backbone = get_backbone(backbone, training, **kwargs) + self.training = training + self.data_format = kwargs.get('data_format', 'channels_first') + + # SSD layers + with tf.variable_scope('additional_layers') as scope: + # down_5 + self._conv8_block = ssd_conv_block(256, 2, 'conv8', + data_format=self.data_format, + kernel_initializer=tf.glorot_uniform_initializer()) + # down_6 + self._conv9_block = ssd_conv_block(128, 2, 'conv9', + data_format=self.data_format, + kernel_initializer=tf.glorot_uniform_initializer()) + self._conv10_block = ssd_conv_block(128, 1, 'conv10', padding='valid', + data_format=self.data_format, + kernel_initializer=tf.glorot_uniform_initializer()) + self._conv11_block = ssd_conv_block(128, 1, 'conv11', padding='valid', + data_format=self.data_format, + kernel_initializer=tf.glorot_uniform_initializer()) + + def forward(self, inputs): + feature_layers = self.backbone(inputs) + if len(feature_layers) > 2: + feature_layers = [feature_layers[-3], feature_layers[-2]] + else: + feature_layers = [feature_layers[-2], feature_layers[-1]] + inputs = feature_layers[-1] + + # forward ssd layers + for layer in self._conv8_block: + inputs = forward_module(layer, inputs, training=self.training) + # conv8 + feature_layers.append(inputs) + for layer in self._conv9_block: + inputs = forward_module(layer, inputs, training=self.training) + # conv9 + feature_layers.append(inputs) + for layer in self._conv10_block: + inputs = forward_module(layer, inputs, training=self.training) + # conv10 + feature_layers.append(inputs) + for layer in self._conv11_block: + inputs = forward_module(layer, inputs, training=self.training) + # conv11 + feature_layers.append(inputs) + + return feature_layers + + +class VGG16Backbone(object): + def __init__(self, data_format='channels_first'): + super(VGG16Backbone, self).__init__() + self._data_format = data_format + self._bn_axis = -1 if data_format == 'channels_last' else 1 + #initializer = tf.glorot_uniform_initializer glorot_normal_initializer + self._conv_initializer = tf.glorot_uniform_initializer + self._conv_bn_initializer = tf.glorot_uniform_initializer#lambda : tf.truncated_normal_initializer(mean=0.0, stddev=0.005) + # VGG layers + self._conv1_block = self.conv_block(2, 64, 3, (1, 1), 'conv1') + # down_1 + self._pool1 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool1') + self._conv2_block = self.conv_block(2, 128, 3, (1, 1), 'conv2') + # down_2 + 
self._pool2 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool2') + self._conv3_block = self.conv_block(3, 256, 3, (1, 1), 'conv3') + # down_3 + self._pool3 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool3') + self._conv4_block = self.conv_block(3, 512, 3, (1, 1), 'conv4') + # down_4 + self._pool4 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool4') + self._conv5_block = self.conv_block(3, 512, 3, (1, 1), 'conv5') + self._pool5 = tf.layers.MaxPooling2D(3, 1, padding='same', data_format=self._data_format, name='pool5') + self._conv6 = tf.layers.Conv2D(filters=1024, kernel_size=3, strides=1, padding='same', dilation_rate=6, + data_format=self._data_format, activation=tf.nn.relu, use_bias=True, + kernel_initializer=self._conv_initializer(), + bias_initializer=tf.zeros_initializer(), + name='fc6', _scope='fc6', _reuse=None) + self._conv7 = tf.layers.Conv2D(filters=1024, kernel_size=1, strides=1, padding='same', + data_format=self._data_format, activation=tf.nn.relu, use_bias=True, + kernel_initializer=self._conv_initializer(), + bias_initializer=tf.zeros_initializer(), + name='fc7', _scope='fc7', _reuse=None) + + def l2_normalize(self, x, name): + with tf.name_scope(name, "l2_normalize", [x]) as name: + axis = -1 if self._data_format == 'channels_last' else 1 + square_sum = tf.reduce_sum(tf.square(x), axis, keep_dims=True) + x_inv_norm = tf.rsqrt(tf.maximum(square_sum, 1e-10)) + return tf.multiply(x, x_inv_norm, name=name) + + def forward(self, inputs, training=False): + # inputs should in BGR + feature_layers = [] + # forward vgg layers + for conv in self._conv1_block: + inputs = forward_module(conv, inputs, training=training) + inputs = self._pool1.apply(inputs) + for conv in self._conv2_block: + inputs = forward_module(conv, inputs, training=training) + inputs = self._pool2.apply(inputs) + for conv in self._conv3_block: + inputs = forward_module(conv, inputs, training=training) + inputs = self._pool3.apply(inputs) + for conv in self._conv4_block: + inputs = forward_module(conv, inputs, training=training) + # conv4_3 + with tf.variable_scope('conv4_3_scale') as scope: + weight_scale = tf.Variable([20.] 
* 512, trainable=training, name='weights') + if self._data_format == 'channels_last': + weight_scale = tf.reshape(weight_scale, [1, 1, 1, -1], name='reshape') + else: + weight_scale = tf.reshape(weight_scale, [1, -1, 1, 1], name='reshape') + + feature_layers.append(tf.multiply(weight_scale, self.l2_normalize(inputs, name='norm'), name='rescale') + ) + inputs = self._pool4.apply(inputs) + for conv in self._conv5_block: + inputs = forward_module(conv, inputs, training=training) + inputs = self._pool5.apply(inputs) + # forward fc layers + inputs = self._conv6.apply(inputs) + inputs = self._conv7.apply(inputs) + # fc7 + feature_layers.append(inputs) + + return feature_layers + + def conv_block(self, num_blocks, filters, kernel_size, strides, name, reuse=None): + with tf.variable_scope(name): + conv_blocks = [] + for ind in range(1, num_blocks + 1): + conv_blocks.append( + tf.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding='same', + data_format=self._data_format, activation=tf.nn.relu, use_bias=True, + kernel_initializer=self._conv_initializer(), + bias_initializer=tf.zeros_initializer(), + name='{}_{}'.format(name, ind), _scope='{}_{}'.format(name, ind), _reuse=None) + ) + return conv_blocks + + def ssd_conv_bn_block(self, filters, strides, name, reuse=None): + with tf.variable_scope(name): + conv_bn_blocks = [] + conv_bn_blocks.append( + tf.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding='same', + data_format=self._data_format, activation=None, use_bias=False, + kernel_initializer=self._conv_bn_initializer(), + bias_initializer=None, + name='{}_1'.format(name), _scope='{}_1'.format(name), _reuse=None) + ) + conv_bn_blocks.append( + tf.layers.BatchNormalization(axis=self._bn_axis, momentum=BN_MOMENTUM, epsilon=BN_EPSILON, fused=USE_FUSED_BN, + name='{}_bn1'.format(name), _scope='{}_bn1'.format(name), _reuse=None) + ) + conv_bn_blocks.append( + ReLuLayer('{}_relu1'.format(name), _scope='{}_relu1'.format(name), _reuse=None) + ) + conv_bn_blocks.append( + tf.layers.Conv2D(filters=filters * 2, kernel_size=3, strides=strides, padding='same', + data_format=self._data_format, activation=None, use_bias=False, + kernel_initializer=self._conv_bn_initializer(), + bias_initializer=None, + name='{}_2'.format(name), _scope='{}_2'.format(name), _reuse=None) + ) + conv_bn_blocks.append( + tf.layers.BatchNormalization(axis=self._bn_axis, momentum=BN_MOMENTUM, epsilon=BN_EPSILON, fused=USE_FUSED_BN, + name='{}_bn2'.format(name), _scope='{}_bn2'.format(name), _reuse=None) + ) + conv_bn_blocks.append( + ReLuLayer('{}_relu2'.format(name), _scope='{}_relu2'.format(name), _reuse=None) + ) + return conv_bn_blocks + + +def multibox_head(feature_layers, num_classes, num_anchors_depth_per_layer, data_format='channels_first'): + with tf.variable_scope('multibox_head'): + cls_preds = [] + loc_preds = [] + for ind, feat in enumerate(feature_layers): + loc_preds.append(tf.layers.conv2d(feat, num_anchors_depth_per_layer[ind] * 4, (3, 3), use_bias=True, + name='loc_{}'.format(ind), strides=(1, 1), + padding='same', data_format=data_format, activation=None, + kernel_initializer=tf.glorot_uniform_initializer(), + bias_initializer=tf.zeros_initializer())) + cls_preds.append(tf.layers.conv2d(feat, num_anchors_depth_per_layer[ind] * num_classes, (3, 3), use_bias=True, + name='cls_{}'.format(ind), strides=(1, 1), + padding='same', data_format=data_format, activation=None, + kernel_initializer=tf.glorot_uniform_initializer(), + bias_initializer=tf.zeros_initializer())) + + return 
loc_preds, cls_preds diff --git a/cv/detection/ssd/tensorflow/preprocessing/preprocessing_unittest.py b/cv/detection/ssd/tensorflow/preprocessing/preprocessing_unittest.py new file mode 100644 index 0000000000000000000000000000000000000000..92e41678aff8f72c97545fd53e03c766f85d2614 --- /dev/null +++ b/cv/detection/ssd/tensorflow/preprocessing/preprocessing_unittest.py @@ -0,0 +1,131 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import tensorflow as tf +from scipy.misc import imread, imsave, imshow, imresize +import numpy as np +import sys; sys.path.insert(0, ".") +from utility import draw_toolbox +import ssd_preprocessing + +slim = tf.contrib.slim + +def save_image_with_bbox(image, labels_, scores_, bboxes_): + if not hasattr(save_image_with_bbox, "counter"): + save_image_with_bbox.counter = 0 # it doesn't exist yet, so initialize it + save_image_with_bbox.counter += 1 + + img_to_draw = np.copy(image) + + img_to_draw = draw_toolbox.bboxes_draw_on_img(img_to_draw, labels_, scores_, bboxes_, thickness=2) + imsave(os.path.join('./debug/{}.jpg').format(save_image_with_bbox.counter), img_to_draw) + return save_image_with_bbox.counter + +def slim_get_split(file_pattern='{}_????'): + # Features in Pascal VOC TFRecords. 
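+    # The slim TFExampleDecoder below reassembles the sparse VarLen bbox features into a dense [num_boxes, 4] tensor in [ymin, xmin, ymax, xmax] order via the BoundingBox handler, so downstream code sees one box tensor per image.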
+ keys_to_features = { + 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'), + 'image/filename': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/height': tf.FixedLenFeature([1], tf.int64), + 'image/width': tf.FixedLenFeature([1], tf.int64), + 'image/channels': tf.FixedLenFeature([1], tf.int64), + 'image/shape': tf.FixedLenFeature([3], tf.int64), + 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64), + 'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64), + 'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64), + } + items_to_handlers = { + 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), + 'filename': slim.tfexample_decoder.Tensor('image/filename'), + 'shape': slim.tfexample_decoder.Tensor('image/shape'), + 'object/bbox': slim.tfexample_decoder.BoundingBox( + ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'), + 'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'), + 'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'), + 'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'), + } + decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) + + dataset = slim.dataset.Dataset( + data_sources=file_pattern, + reader=tf.TFRecordReader, + decoder=decoder, + num_samples=100, + items_to_descriptions=None, + num_classes=21, + labels_to_names=None) + + with tf.name_scope('dataset_data_provider'): + provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + num_readers=2, + common_queue_capacity=32, + common_queue_min=8, + shuffle=True, + num_epochs=1) + + [org_image, filename, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(['image', 'filename', 'shape', + 'object/label', + 'object/bbox', + 'object/difficult']) + image, glabels, gbboxes = ssd_preprocessing.preprocess_image(org_image, glabels_raw, gbboxes_raw, [300, 300], is_training=True, data_format='channels_first', output_rgb=True) + + image = tf.transpose(image, perm=(1, 2, 0)) + save_image_op = tf.py_func(save_image_with_bbox, + [ssd_preprocessing.unwhiten_image(image), + tf.clip_by_value(glabels, 0, tf.int64.max), + tf.ones_like(glabels), + gbboxes], + tf.int64, stateful=True) + return save_image_op + +if __name__ == '__main__': + save_image_op = slim_get_split('/media/rs/7A0EE8880EE83EAF/Detections/SSD/dataset/tfrecords/*') + # Create the graph, etc. + init_op = tf.group([tf.local_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()]) + + # Create a session for running operations in the Graph. + sess = tf.Session() + # Initialize the variables (like the epoch counter). + sess.run(init_op) + + # Start input enqueue threads. + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + try: + while not coord.should_stop(): + # Run training steps or whatever + print(sess.run(save_image_op)) + + except tf.errors.OutOfRangeError: + print('Done training -- epoch limit reached') + finally: + # When done, ask the threads to stop. + coord.request_stop() + + # Wait for threads to finish. 
+ coord.join(threads) + sess.close() diff --git a/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py b/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..739305df2d18d7eca660a01b4e7cddcab0679fc2 --- /dev/null +++ b/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py @@ -0,0 +1,521 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Provides utilities to preprocess images. + +The preprocessing steps for VGG were introduced in the following technical +report: + + Very Deep Convolutional Networks For Large-Scale Image Recognition + Karen Simonyan and Andrew Zisserman + arXiv technical report, 2015 + PDF: http://arxiv.org/pdf/1409.1556.pdf + ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf + CC-BY-4.0 + +More information can be obtained from the VGG website: +www.robots.ox.ac.uk/~vgg/research/very_deep/ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.python.ops import control_flow_ops +import tf_slim as slim + + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 + +def _ImageDimensions(image, rank = 3): + """Returns the dimensions of an image tensor. + + Args: + image: A rank-D Tensor. For 3-D of shape: `[height, width, channels]`. + rank: The expected rank of the image + + Returns: + A list of corresponding to the dimensions of the + input image. Dimensions that are statically known are python integers, + otherwise they are integer scalar tensors. + """ + if image.get_shape().is_fully_defined(): + return image.get_shape().as_list() + else: + static_shape = image.get_shape().with_rank(rank).as_list() + dynamic_shape = tf.unstack(tf.shape(image), rank) + return [s if s is not None else d + for s, d in zip(static_shape, dynamic_shape)] + +def apply_with_random_selector(x, func, num_cases): + """Computes func(x, sel), with sel sampled from [0...num_cases-1]. + + Args: + x: input Tensor. + func: Python function to apply. + num_cases: Python int32, number of cases to sample sel from. + + Returns: + The result of func(x, sel), where func receives the value of the + selector as a python integer, but sel is sampled dynamically. + """ + sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32) + # Pass the real x only to one of the func calls. + return control_flow_ops.merge([ + func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case) + for case in range(num_cases)])[0] + + +def distort_color(image, color_ordering=0, fast_mode=True, scope=None): + """Distort the color of a Tensor image. + + Each color distortion is non-commutative and thus ordering of the color ops + matters. Ideally we would randomly permute the ordering of the color ops. 
+ Rather then adding that level of complication, we select a distinct ordering + of color ops for each preprocessing thread. + + Args: + image: 3-D Tensor containing single image in [0, 1]. + color_ordering: Python int, a type of distortion (valid values: 0-3). + fast_mode: Avoids slower ops (random_hue and random_contrast) + scope: Optional scope for name_scope. + Returns: + 3-D Tensor color-distorted image on range [0, 1] + Raises: + ValueError: if color_ordering not in [0, 3] + """ + with tf.name_scope(scope, 'distort_color', [image]): + if fast_mode: + if color_ordering == 0: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + else: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + else: + if color_ordering == 0: + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + elif color_ordering == 1: + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + elif color_ordering == 2: + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + elif color_ordering == 3: + image = tf.image.random_hue(image, max_delta=0.2) + image = tf.image.random_saturation(image, lower=0.5, upper=1.5) + image = tf.image.random_contrast(image, lower=0.5, upper=1.5) + image = tf.image.random_brightness(image, max_delta=32. / 255.) + else: + raise ValueError('color_ordering must be in [0, 3]') + + # The random_* ops do not necessarily clamp. + return tf.clip_by_value(image, 0.0, 1.0) + +def ssd_random_sample_patch(image, labels, bboxes, ratio_list=[0.1, 0.3, 0.5, 0.7, 0.9, 1.], name=None): + '''ssd_random_sample_patch. + select one min_iou + sample _width and _height from [0-width] and [0-height] + check if the aspect ratio between 0.5-2. 
+ select left_top point from (width - _width, height - _height) + check if this bbox has a min_iou with all ground_truth bboxes + keep ground_truth those center is in this sampled patch, if none then try again + ''' + def sample_width_height(width, height): + with tf.name_scope('sample_width_height'): + index = 0 + max_attempt = 10 + sampled_width, sampled_height = width, height + + def condition(index, sampled_width, sampled_height, width, height): + return tf.logical_or(tf.logical_and(tf.logical_or(tf.greater(sampled_width, sampled_height * 2), + tf.greater(sampled_height, sampled_width * 2)), + tf.less(index, max_attempt)), + tf.less(index, 1)) + + def body(index, sampled_width, sampled_height, width, height): + sampled_width = tf.random_uniform([1], minval=0.3, maxval=0.999, dtype=tf.float32)[0] * width + sampled_height = tf.random_uniform([1], minval=0.3, maxval=0.999, dtype=tf.float32)[0] *height + + return index+1, sampled_width, sampled_height, width, height + + [index, sampled_width, sampled_height, _, _] = tf.while_loop(condition, body, + [index, sampled_width, sampled_height, width, height], parallel_iterations=4, back_prop=False, swap_memory=True) + + return tf.cast(sampled_width, tf.int32), tf.cast(sampled_height, tf.int32) + + def jaccard_with_anchors(roi, bboxes): + with tf.name_scope('jaccard_with_anchors'): + int_ymin = tf.maximum(roi[0], bboxes[:, 0]) + int_xmin = tf.maximum(roi[1], bboxes[:, 1]) + int_ymax = tf.minimum(roi[2], bboxes[:, 2]) + int_xmax = tf.minimum(roi[3], bboxes[:, 3]) + h = tf.maximum(int_ymax - int_ymin, 0.) + w = tf.maximum(int_xmax - int_xmin, 0.) + inter_vol = h * w + union_vol = (roi[3] - roi[1]) * (roi[2] - roi[0]) + ((bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1]) - inter_vol) + jaccard = tf.div(inter_vol, union_vol) + return jaccard + + def areas(bboxes): + with tf.name_scope('bboxes_areas'): + vol = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + return vol + + def check_roi_center(width, height, labels, bboxes): + with tf.name_scope('check_roi_center'): + index = 0 + max_attempt = 20 + roi = [0., 0., 0., 0.] 
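+            # The while_loop below repeatedly samples a candidate window until at least one ground-truth box center falls inside it, or max_attempt is reached.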
+ float_width = tf.cast(width, tf.float32) + float_height = tf.cast(height, tf.float32) + mask = tf.cast(tf.zeros_like(labels, dtype=tf.uint8), tf.bool) + center_x, center_y = (bboxes[:, 1] + bboxes[:, 3]) / 2, (bboxes[:, 0] + bboxes[:, 2]) / 2 + + def condition(index, roi, mask): + return tf.logical_or(tf.logical_and(tf.reduce_sum(tf.cast(mask, tf.int32)) < 1, + tf.less(index, max_attempt)), + tf.less(index, 1)) + + def body(index, roi, mask): + sampled_width, sampled_height = sample_width_height(float_width, float_height) + + x = tf.random_uniform([], minval=0, maxval=width - sampled_width, dtype=tf.int32) + y = tf.random_uniform([], minval=0, maxval=height - sampled_height, dtype=tf.int32) + + roi = [tf.cast(y, tf.float32) / float_height, + tf.cast(x, tf.float32) / float_width, + tf.cast(y + sampled_height, tf.float32) / float_height, + tf.cast(x + sampled_width, tf.float32) / float_width] + + mask_min = tf.logical_and(tf.greater(center_y, roi[0]), tf.greater(center_x, roi[1])) + mask_max = tf.logical_and(tf.less(center_y, roi[2]), tf.less(center_x, roi[3])) + mask = tf.logical_and(mask_min, mask_max) + + return index + 1, roi, mask + + [index, roi, mask] = tf.while_loop(condition, body, [index, roi, mask], parallel_iterations=10, back_prop=False, swap_memory=True) + + mask_labels = tf.boolean_mask(labels, mask) + mask_bboxes = tf.boolean_mask(bboxes, mask) + + return roi, mask_labels, mask_bboxes + def check_roi_overlap(width, height, labels, bboxes, min_iou): + with tf.name_scope('check_roi_overlap'): + index = 0 + max_attempt = 50 + roi = [0., 0., 1., 1.] + mask_labels = labels + mask_bboxes = bboxes + + def condition(index, roi, mask_labels, mask_bboxes): + return tf.logical_or(tf.logical_or(tf.logical_and(tf.reduce_sum(tf.cast(jaccard_with_anchors(roi, mask_bboxes) < min_iou, tf.int32)) > 0, + tf.less(index, max_attempt)), + tf.less(index, 1)), + tf.less(tf.shape(mask_labels)[0], 1)) + + def body(index, roi, mask_labels, mask_bboxes): + roi, mask_labels, mask_bboxes = check_roi_center(width, height, labels, bboxes) + return index+1, roi, mask_labels, mask_bboxes + + [index, roi, mask_labels, mask_bboxes] = tf.while_loop(condition, body, [index, roi, mask_labels, mask_bboxes], parallel_iterations=16, back_prop=False, swap_memory=True) + + return tf.cond(tf.greater(tf.shape(mask_labels)[0], 0), + lambda : (tf.cast([roi[0] * tf.cast(height, tf.float32), + roi[1] * tf.cast(width, tf.float32), + (roi[2] - roi[0]) * tf.cast(height, tf.float32), + (roi[3] - roi[1]) * tf.cast(width, tf.float32)], tf.int32), mask_labels, mask_bboxes), + lambda : (tf.cast([0, 0, height, width], tf.int32), labels, bboxes)) + + + def sample_patch(image, labels, bboxes, min_iou): + with tf.name_scope('sample_patch'): + height, width, depth = _ImageDimensions(image, rank=3) + + roi_slice_range, mask_labels, mask_bboxes = check_roi_overlap(width, height, labels, bboxes, min_iou) + + scale = tf.cast(tf.stack([height, width, height, width]), mask_bboxes.dtype) + mask_bboxes = mask_bboxes * scale + + # Add offset. 
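+            # roi_slice_range is [y, x, h, w] in absolute pixels, so subtracting [y, x, y, x] shifts the (now absolute) boxes into the crop's coordinate frame.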
+ offset = tf.cast(tf.stack([roi_slice_range[0], roi_slice_range[1], roi_slice_range[0], roi_slice_range[1]]), mask_bboxes.dtype) + mask_bboxes = mask_bboxes - offset + + cliped_ymin = tf.maximum(0., mask_bboxes[:, 0]) + cliped_xmin = tf.maximum(0., mask_bboxes[:, 1]) + cliped_ymax = tf.minimum(tf.cast(roi_slice_range[2], tf.float32), mask_bboxes[:, 2]) + cliped_xmax = tf.minimum(tf.cast(roi_slice_range[3], tf.float32), mask_bboxes[:, 3]) + + mask_bboxes = tf.stack([cliped_ymin, cliped_xmin, cliped_ymax, cliped_xmax], axis=-1) + # Rescale to target dimension. + scale = tf.cast(tf.stack([roi_slice_range[2], roi_slice_range[3], + roi_slice_range[2], roi_slice_range[3]]), mask_bboxes.dtype) + + return tf.cond(tf.logical_or(tf.less(roi_slice_range[2], 1), tf.less(roi_slice_range[3], 1)), + lambda: (image, labels, bboxes), + lambda: (tf.slice(image, [roi_slice_range[0], roi_slice_range[1], 0], [roi_slice_range[2], roi_slice_range[3], -1]), + mask_labels, mask_bboxes / scale)) + + with tf.name_scope('ssd_random_sample_patch'): + image = tf.convert_to_tensor(image, name='image') + + min_iou_list = tf.convert_to_tensor(ratio_list) + samples_min_iou = tf.multinomial(tf.log([[1. / len(ratio_list)] * len(ratio_list)]), 1) + + sampled_min_iou = min_iou_list[tf.cast(samples_min_iou[0][0], tf.int32)] + + return tf.cond(tf.less(sampled_min_iou, 1.), lambda: sample_patch(image, labels, bboxes, sampled_min_iou), lambda: (image, labels, bboxes)) + +def ssd_random_expand(image, bboxes, ratio=2., name=None): + with tf.name_scope('ssd_random_expand'): + image = tf.convert_to_tensor(image, name='image') + if image.get_shape().ndims != 3: + raise ValueError('\'image\' must have 3 dimensions.') + + height, width, depth = _ImageDimensions(image, rank=3) + + float_height, float_width = tf.to_float(height), tf.to_float(width) + + canvas_width, canvas_height = tf.to_int32(float_width * ratio), tf.to_int32(float_height * ratio) + + mean_color_of_image = [_R_MEAN/255., _G_MEAN/255., _B_MEAN/255.]#tf.reduce_mean(tf.reshape(image, [-1, 3]), 0) + + x = tf.random_uniform([], minval=0, maxval=canvas_width - width, dtype=tf.int32) + y = tf.random_uniform([], minval=0, maxval=canvas_height - height, dtype=tf.int32) + + paddings = tf.convert_to_tensor([[y, canvas_height - height - y], [x, canvas_width - width - x]]) + + big_canvas = tf.stack([tf.pad(image[:, :, 0], paddings, "CONSTANT", constant_values = mean_color_of_image[0]), + tf.pad(image[:, :, 1], paddings, "CONSTANT", constant_values = mean_color_of_image[1]), + tf.pad(image[:, :, 2], paddings, "CONSTANT", constant_values = mean_color_of_image[2])], axis=-1) + + scale = tf.cast(tf.stack([height, width, height, width]), bboxes.dtype) + absolute_bboxes = bboxes * scale + tf.cast(tf.stack([y, x, y, x]), bboxes.dtype) + + return big_canvas, absolute_bboxes / tf.cast(tf.stack([canvas_height, canvas_width, canvas_height, canvas_width]), bboxes.dtype) + +# def ssd_random_sample_patch_wrapper(image, labels, bboxes): +# with tf.name_scope('ssd_random_sample_patch_wrapper'): +# orgi_image, orgi_labels, orgi_bboxes = image, labels, bboxes +# def check_bboxes(bboxes): +# areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) +# return tf.logical_and(tf.logical_and(areas < 0.9, areas > 0.001), +# tf.logical_and((bboxes[:, 3] - bboxes[:, 1]) > 0.025, (bboxes[:, 2] - bboxes[:, 0]) > 0.025)) + +# index = 0 +# max_attempt = 3 +# def condition(index, image, labels, bboxes): +# return tf.logical_or(tf.logical_and(tf.reduce_sum(tf.cast(check_bboxes(bboxes), tf.int64)) < 1, 
tf.less(index, max_attempt)), tf.less(index, 1)) + +# def body(index, image, labels, bboxes): +# image, bboxes = tf.cond(tf.random_uniform([], minval=0., maxval=1., dtype=tf.float32) < 0.5, +# lambda: (image, bboxes), +# lambda: ssd_random_expand(image, bboxes, tf.random_uniform([1], minval=1.1, maxval=4., dtype=tf.float32)[0])) +# # Distort image and bounding boxes. +# random_sample_image, labels, bboxes = ssd_random_sample_patch(image, labels, bboxes, ratio_list=[-0.1, 0.1, 0.3, 0.5, 0.7, 0.9, 1.]) +# random_sample_image.set_shape([None, None, 3]) +# return index+1, random_sample_image, labels, bboxes + +# [index, image, labels, bboxes] = tf.while_loop(condition, body, [index, orgi_image, orgi_labels, orgi_bboxes], parallel_iterations=4, back_prop=False, swap_memory=True) + +# valid_mask = check_bboxes(bboxes) +# labels, bboxes = tf.boolean_mask(labels, valid_mask), tf.boolean_mask(bboxes, valid_mask) +# return tf.cond(tf.less(index, max_attempt), +# lambda : (image, labels, bboxes), +# lambda : (orgi_image, orgi_labels, orgi_bboxes)) + +def ssd_random_sample_patch_wrapper(image, labels, bboxes): + with tf.name_scope('ssd_random_sample_patch_wrapper'): + orgi_image, orgi_labels, orgi_bboxes = image, labels, bboxes + def check_bboxes(bboxes): + areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + return tf.logical_and(tf.logical_and(areas < 0.9, areas > 0.001), + tf.logical_and((bboxes[:, 3] - bboxes[:, 1]) > 0.025, (bboxes[:, 2] - bboxes[:, 0]) > 0.025)) + + index = 0 + max_attempt = 3 + def condition(index, image, labels, bboxes, orgi_image, orgi_labels, orgi_bboxes): + return tf.logical_or(tf.logical_and(tf.reduce_sum(tf.cast(check_bboxes(bboxes), tf.int64)) < 1, tf.less(index, max_attempt)), tf.less(index, 1)) + + def body(index, image, labels, bboxes, orgi_image, orgi_labels, orgi_bboxes): + image, bboxes = tf.cond(tf.random_uniform([], minval=0., maxval=1., dtype=tf.float32) < 0.5, + lambda: (orgi_image, orgi_bboxes), + lambda: ssd_random_expand(orgi_image, orgi_bboxes, tf.random_uniform([1], minval=1.1, maxval=4., dtype=tf.float32)[0])) + # Distort image and bounding boxes. + random_sample_image, labels, bboxes = ssd_random_sample_patch(image, orgi_labels, bboxes, ratio_list=[-0.1, 0.1, 0.3, 0.5, 0.7, 0.9, 1.]) + random_sample_image.set_shape([None, None, 3]) + return index+1, random_sample_image, labels, bboxes, orgi_image, orgi_labels, orgi_bboxes + + [index, image, labels, bboxes, orgi_image, orgi_labels, orgi_bboxes] = tf.while_loop(condition, body, [index, image, labels, bboxes, orgi_image, orgi_labels, orgi_bboxes], parallel_iterations=4, back_prop=False, swap_memory=True) + + valid_mask = check_bboxes(bboxes) + labels, bboxes = tf.boolean_mask(labels, valid_mask), tf.boolean_mask(bboxes, valid_mask) + return tf.cond(tf.less(index, max_attempt), + lambda : (image, labels, bboxes), + lambda : (orgi_image, orgi_labels, orgi_bboxes)) + +def _mean_image_subtraction(image, means): + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image = _mean_image_subtraction(image, means) + + Note that the rank of `image` must be known. + + Args: + image: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `means`. 
+ """ + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + num_channels = image.get_shape().as_list()[-1] + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) + for i in range(num_channels): + channels[i] -= means[i] + return tf.concat(axis=2, values=channels) + +def unwhiten_image(image): + means=[_R_MEAN, _G_MEAN, _B_MEAN] + num_channels = image.get_shape().as_list()[-1] + channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) + for i in range(num_channels): + channels[i] += means[i] + return tf.concat(axis=2, values=channels) + +def random_flip_left_right(image, bboxes): + with tf.name_scope('random_flip_left_right'): + uniform_random = tf.random_uniform([], 0, 1.0) + mirror_cond = tf.less(uniform_random, .5) + # Flip image. + result = tf.cond(mirror_cond, lambda: tf.image.flip_left_right(image), lambda: image) + # Flip bboxes. + mirror_bboxes = tf.stack([bboxes[:, 0], 1 - bboxes[:, 3], + bboxes[:, 2], 1 - bboxes[:, 1]], axis=-1) + bboxes = tf.cond(mirror_cond, lambda: mirror_bboxes, lambda: bboxes) + return result, bboxes + +def preprocess_for_train(image, labels, bboxes, out_shape, data_format='channels_first', scope='ssd_preprocessing_train', output_rgb=True): + """Preprocesses the given image for training. + + Args: + image: A `Tensor` representing an image of arbitrary size. + labels: A `Tensor` containing all labels for all bboxes of this image. + bboxes: A `Tensor` containing all bboxes of this image, in range [0., 1.] with shape [num_bboxes, 4]. + out_shape: The height and width of the image after preprocessing. + data_format: The data_format of the desired output image. + Returns: + A preprocessed image. + """ + with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]): + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + # Convert to float scaled [0, 1]. + orig_dtype = image.dtype + if orig_dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + + # Randomly distort the colors. There are 4 ways to do it. + distort_image = apply_with_random_selector(image, + lambda x, ordering: distort_color(x, ordering, True), + num_cases=4) + + random_sample_image, labels, bboxes = ssd_random_sample_patch_wrapper(distort_image, labels, bboxes) + # image, bboxes = tf.cond(tf.random_uniform([1], minval=0., maxval=1., dtype=tf.float32)[0] < 0.25, + # lambda: (image, bboxes), + # lambda: ssd_random_expand(image, bboxes, tf.random_uniform([1], minval=2, maxval=4, dtype=tf.int32)[0])) + + # # Distort image and bounding boxes. + # random_sample_image, labels, bboxes = ssd_random_sample_patch(image, labels, bboxes, ratio_list=[0.1, 0.3, 0.5, 0.7, 0.9, 1.]) + + # Randomly flip the image horizontally. + random_sample_flip_image, bboxes = random_flip_left_right(random_sample_image, bboxes) + # Rescale to VGG input scale. 
+ random_sample_flip_resized_image = tf.image.resize_images(random_sample_flip_image, out_shape, method=tf.image.ResizeMethod.BILINEAR, align_corners=False) + random_sample_flip_resized_image.set_shape([None, None, 3]) + + final_image = tf.to_float(tf.image.convert_image_dtype(random_sample_flip_resized_image, orig_dtype, saturate=True)) + final_image = _mean_image_subtraction(final_image, [_R_MEAN, _G_MEAN, _B_MEAN]) + + final_image.set_shape(out_shape + [3]) + if not output_rgb: + image_channels = tf.unstack(final_image, axis=-1, name='split_rgb') + final_image = tf.stack([image_channels[2], image_channels[1], image_channels[0]], axis=-1, name='merge_bgr') + if data_format == 'channels_first': + final_image = tf.transpose(final_image, perm=(2, 0, 1)) + return final_image, labels, bboxes + +def preprocess_for_eval(image, out_shape, data_format='channels_first', scope='ssd_preprocessing_eval', output_rgb=True): + """Preprocesses the given image for evaluation. + + Args: + image: A `Tensor` representing an image of arbitrary size. + out_shape: The height and width of the image after preprocessing. + data_format: The data_format of the desired output image. + Returns: + A preprocessed image. + """ + with tf.name_scope(scope, 'ssd_preprocessing_eval', [image]): + image = tf.to_float(image) + image = tf.image.resize_images(image, out_shape, method=tf.image.ResizeMethod.BILINEAR, align_corners=False) + image.set_shape(out_shape + [3]) + + image = _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) + if not output_rgb: + image_channels = tf.unstack(image, axis=-1, name='split_rgb') + image = tf.stack([image_channels[2], image_channels[1], image_channels[0]], axis=-1, name='merge_bgr') + # Image data format. + if data_format == 'channels_first': + image = tf.transpose(image, perm=(2, 0, 1)) + return image + +def preprocess_image(image, labels, bboxes, out_shape, is_training=False, data_format='channels_first', output_rgb=True): + """Preprocesses the given image. + + Args: + image: A `Tensor` representing an image of arbitrary size. + labels: A `Tensor` containing all labels for all bboxes of this image. + bboxes: A `Tensor` containing all bboxes of this image, in range [0., 1.] with shape [num_bboxes, 4]. + out_shape: The height and width of the image after preprocessing. + is_training: Wether we are in training phase. + data_format: The data_format of the desired output image. + + Returns: + A preprocessed image. + """ + if is_training: + return preprocess_for_train(image, labels, bboxes, out_shape, data_format=data_format, output_rgb=output_rgb) + else: + return preprocess_for_eval(image, out_shape, data_format=data_format, output_rgb=output_rgb) diff --git a/cv/detection/ssd/tensorflow/readme_origin.md b/cv/detection/ssd/tensorflow/readme_origin.md new file mode 100644 index 0000000000000000000000000000000000000000..f2b3a20a623d1e3f80373651868edb350eadb4d4 --- /dev/null +++ b/cv/detection/ssd/tensorflow/readme_origin.md @@ -0,0 +1,138 @@ +# State-of-the-art Single Shot MultiBox Detector in TensorFlow + +This repository contains codes of the reimplementation of [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) in TensorFlow. If your goal is to reproduce the results in the original paper, please use the official [codes](https://github.com/weiliu89/caffe/tree/ssd). 
+ +There are already some TensorFlow-based SSD reimplementations on GitHub; the main special features of this repo include: + +- state-of-the-art performance (77.8% mAP) when training from the VGG-16 pre-trained model (SSD300-VGG16). +- the model is trained using the TensorFlow high-level API [tf.estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator). Although TensorFlow provides many APIs, the Estimator API is highly recommended to yield scalable, high-performance models. +- all code is written in pure TensorFlow ops (no numpy operations) to ensure performance and portability. +- uses the SSD augmentation pipeline described in the original paper. +- PyTorch-like model definition using the high-level [tf.layers](https://www.tensorflow.org/api_docs/python/tf/layers) API for better readability ^-^. +- high degree of modularity to ease further development. +- using replicate\_model\_fn makes it flexible to use one or more GPUs. + +***New update (77.9% mAP): using absolute bbox coordinates instead of normalized coordinates; check out [here](https://github.com/HiKapok/SSD.TensorFlow/tree/AbsoluteCoord).*** + +## ## +## Usage +- Download the [Pascal VOC Dataset](https://pjreddie.com/projects/pascal-voc-dataset-mirror/) and reorganize the directory as follows: + ``` + VOCROOT/ + |->VOC2007/ + | |->Annotations/ + | |->ImageSets/ + | |->... + |->VOC2012/ + | |->Annotations/ + | |->ImageSets/ + | |->... + |->VOC2007TEST/ + | |->Annotations/ + | |->... + ``` + VOCROOT is your path to the Pascal VOC dataset. +- Run the following script to generate TFRecords: + ```sh + python dataset/convert_tfrecords.py --dataset_directory=VOCROOT --output_directory=./dataset/tfrecords + ``` +- Download the **pre-trained VGG-16 model (reduced-fc)** from [here](https://drive.google.com/drive/folders/184srhbt8_uvLKeWW_Yo8Mc5wTyc0lJT7) and put it into a sub-directory named 'model' (we support SaverDef.V2 by default; the V1 version is also available for the sake of compatibility). +- Run the following script to start training: + + ```sh + python train_ssd.py + ``` +- Run the following scripts for evaluation to get mAP: + + ```sh + python eval_ssd.py + python voc_eval.py + ``` + Note: you first need to modify some directories in voc_eval.py. +- Run the following script for visualization: + ```sh + python simple_ssd_demo.py + ``` + +All the code was tested under TensorFlow 1.6, Python 3.5, Ubuntu 16.04 with CUDA 8.0. If you want to run training yourself, one decent GPU is highly recommended. The whole training process for the VOC07+12 dataset took ~120k steps in total, and each step (32 samples per batch) took ~1s on my little workstation with a single GTX 1080 Ti GPU card. If you need to run training with less GPU memory, you can try half of the current batch size (e.g. 16), lower the learning rate and run more steps, watching TensorBoard until convergence; see the example command below.
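+ For example, on a single GPU with limited memory, one possible starting point (illustrative only, not a tuned configuration) is to combine the flags already defined in train_ssd.py: + + ```sh + python train_ssd.py --batch_size=16 --learning_rate=5e-4 --max_number_of_steps=240000 + ``` + + When running more steps like this, you may also want to shift the decay_boundaries flag accordingly.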
BTW, the code here has also been tested under TensorFlow 1.4 with CUDA 8.0, but some modifications are needed to enable replicate-model training; take the following steps if you need this: + +- copy all the code of [this file](https://github.com/tensorflow/tensorflow/blob/v1.6.0/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py) into a local file named 'tf\_replicate\_model\_fn.py' +- add one more line [here](https://github.com/HiKapok/SSD.TensorFlow/blob/899e08dad48669ca0c444284977e3d7ffa1da5fe/train_ssd.py#L25) to import the module 'tf\_replicate\_model\_fn' +- change 'tf.contrib.estimator' [here](https://github.com/HiKapok/SSD.TensorFlow/blob/899e08dad48669ca0c444284977e3d7ffa1da5fe/train_ssd.py#L383) and [here](https://github.com/HiKapok/SSD.TensorFlow/blob/899e08dad48669ca0c444284977e3d7ffa1da5fe/train_ssd.py#L422) to 'tf\_replicate\_model\_fn' +- the training process should now run correctly +- before you run 'eval_ssd.py', you should also remove [this line](https://github.com/HiKapok/SSD.TensorFlow/blob/e8296848b9f6eb585da5945d6b3ae099029ef4bf/eval_ssd.py#L369) because of an interface incompatibility + + +***This repo was created only recently; any contribution is welcome.*** + +## Results (VOC07 Metric) + +This implementation (SSD300-VGG16) yields **77.8% mAP** on the PASCAL VOC 2007 test dataset (the original performance described in the paper is 77.2% mAP); the details are as follows: + +| sofa | bird | pottedplant | bus | diningtable | cow | bottle | horse | aeroplane | motorbike | +|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:| +| 78.9 | 76.2 | 53.5 | 85.2 | 75.5 | 85.0 | 48.6 | 86.7 | 82.2 | 83.4 | +| **sheep** | **train** | **boat** | **bicycle** | **chair** | **cat** | **tvmonitor** | **person** | **car** | **dog** | +| 82.4 | 87.6 | 72.7 | 83.0 | 61.3 | 88.2 | 74.5 | 79.6 | 85.3 | 86.4 | + +You can download the trained model (VOC07+12 Train) from [GoogleDrive](https://drive.google.com/open?id=1yeYcfcOURcZ4DaElEn9C2xY1NymGzG5W) for further research. + +For Chinese friends, you can also download both the trained model and the pre-trained VGG-16 weights from [BaiduYun Drive](https://pan.baidu.com/s/1kRhZd4p-N46JFpVkMgU3fg), access code: **tg64**. + +Here are the training logs and some detection results: + +![](logs/loss.JPG "loss") +![](logs/celoss.JPG "celoss") +![](logs/locloss.JPG "locloss") +![](demo/demo1.jpg "demo1") +![](demo/demo2.jpg "demo2") +![](demo/demo3.jpg "demo3") + +## *Too Busy* TODO + +- Adapting for the COCO dataset +- Updating to an SSD-512 version +- Transferring to other backbone networks + +## Known Issues + +- Got 'TypeError: Expected binary or unicode string, got None' while training + - Why: There may be some inconsistencies between different TensorFlow versions. + - How: If you get this error, try changing the default value of checkpoint_path to './model/vgg16.ckpt' in [train_ssd.py](https://github.com/HiKapok/SSD.TensorFlow/blob/86e3fa600d8d07122e9366ae664dea8c3c87c622/train_ssd.py#L107). For more information, see [issue6](https://github.com/HiKapok/SSD.TensorFlow/issues/6) and [issue9](https://github.com/HiKapok/SSD.TensorFlow/issues/9). +- Nan loss during training + - Why: This is caused by the default learning rate, which is a little too high for some TensorFlow versions. + - How: I don't know the details of the behavior differences between versions.
There are two workarounds: + - Adding warm-up: change some codes [here](https://github.com/HiKapok/SSD.TensorFlow/blob/d9cf250df81c8af29985c03d76636b2b8b19f089/train_ssd.py#L99) to the following snippet: + + ```python + tf.app.flags.DEFINE_string( + 'decay_boundaries', '2000, 80000, 100000', + 'Learning rate decay boundaries by global_step (comma-separated list).') + tf.app.flags.DEFINE_string( + 'lr_decay_factors', '0.1, 1, 0.1, 0.01', + 'The values of learning_rate decay factor for each segment between boundaries (comma-separated list).') + ``` + - Lower the learning rate and run more steps until convergency. +- Why this re-implementation perform better than the reported performance + - I don't know + +## Citation + +Use this bibtex to cite this repository: +``` +@misc{kapok_ssd_2018, + title={Single Shot MultiBox Detector in TensorFlow}, + author={Changan Wang}, + year={2018}, + publisher={Github}, + journal={GitHub repository}, + howpublished={\url{https://github.com/HiKapok/SSD.TensorFlow}}, +} +``` + +## Discussion + +Welcome to join in QQ Group(758790869) for more discussion + +## ## +Apache License, Version 2.0 diff --git a/cv/detection/ssd/tensorflow/simple_ssd_demo.py b/cv/detection/ssd/tensorflow/simple_ssd_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..67540bc148e005f2e99268310957e7609cb6d1c7 --- /dev/null +++ b/cv/detection/ssd/tensorflow/simple_ssd_demo.py @@ -0,0 +1,220 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow as tf +from scipy.misc import imread, imsave, imshow, imresize +import numpy as np + +from net import ssd_net + +from dataset import dataset_common +from preprocessing import ssd_preprocessing +from utility import anchor_manipulator +from utility import draw_toolbox + +# scaffold related configuration +tf.app.flags.DEFINE_integer( + 'num_classes', 21, 'Number of classes to use in the dataset.') +# model related configuration +tf.app.flags.DEFINE_integer( + 'train_image_size', 300, + 'The size of the input image for the model to use.') +tf.app.flags.DEFINE_string( + 'data_format', 'channels_last', # 'channels_first' or 'channels_last' + 'A flag to override the data format used in the model. channels_first ' + 'provides a performance boost on GPU but is not always compatible ' + 'with CPU. 
If left unspecified, the data format will be chosen ' + 'automatically based on whether TensorFlow was built for CPU or GPU.') +tf.app.flags.DEFINE_float( + 'select_threshold', 0.2, 'Class-specific confidence score threshold for selecting a box.') +tf.app.flags.DEFINE_float( + 'min_size', 0.03, 'The min size of bboxes to keep.') +tf.app.flags.DEFINE_float( + 'nms_threshold', 0.45, 'Matching threshold in NMS algorithm.') +tf.app.flags.DEFINE_integer( + 'nms_topk', 20, 'Number of total object to keep after NMS.') +tf.app.flags.DEFINE_integer( + 'keep_topk', 200, 'Number of total object to keep for each image before nms.') +# checkpoint related configuration +tf.app.flags.DEFINE_string( + 'checkpoint_path', './logs', + 'The path to a checkpoint from which to fine-tune.') +tf.app.flags.DEFINE_string( + 'model_scope', 'ssd300', + 'Model scope name used to replace the name_scope in checkpoint.') + +FLAGS = tf.app.flags.FLAGS +#CUDA_VISIBLE_DEVICES + +def get_checkpoint(): + if tf.gfile.IsDirectory(FLAGS.checkpoint_path): + checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) + else: + checkpoint_path = FLAGS.checkpoint_path + + return checkpoint_path + +def select_bboxes(scores_pred, bboxes_pred, num_classes, select_threshold): + selected_bboxes = {} + selected_scores = {} + with tf.name_scope('select_bboxes', [scores_pred, bboxes_pred]): + for class_ind in range(1, num_classes): + class_scores = scores_pred[:, class_ind] + + select_mask = class_scores > select_threshold + select_mask = tf.cast(select_mask, tf.float32) + selected_bboxes[class_ind] = tf.multiply(bboxes_pred, tf.expand_dims(select_mask, axis=-1)) + selected_scores[class_ind] = tf.multiply(class_scores, select_mask) + + return selected_bboxes, selected_scores + +def clip_bboxes(ymin, xmin, ymax, xmax, name): + with tf.name_scope(name, 'clip_bboxes', [ymin, xmin, ymax, xmax]): + ymin = tf.maximum(ymin, 0.) + xmin = tf.maximum(xmin, 0.) + ymax = tf.minimum(ymax, 1.) + xmax = tf.minimum(xmax, 1.) 
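+        # Guard against degenerate boxes by forcing ymin <= ymax and xmin <= xmax after clipping to [0, 1].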
+ + ymin = tf.minimum(ymin, ymax) + xmin = tf.minimum(xmin, xmax) + + return ymin, xmin, ymax, xmax + +def filter_bboxes(scores_pred, ymin, xmin, ymax, xmax, min_size, name): + with tf.name_scope(name, 'filter_bboxes', [scores_pred, ymin, xmin, ymax, xmax]): + width = xmax - xmin + height = ymax - ymin + + filter_mask = tf.logical_and(width > min_size, height > min_size) + + filter_mask = tf.cast(filter_mask, tf.float32) + return tf.multiply(ymin, filter_mask), tf.multiply(xmin, filter_mask), \ + tf.multiply(ymax, filter_mask), tf.multiply(xmax, filter_mask), tf.multiply(scores_pred, filter_mask) + +def sort_bboxes(scores_pred, ymin, xmin, ymax, xmax, keep_topk, name): + with tf.name_scope(name, 'sort_bboxes', [scores_pred, ymin, xmin, ymax, xmax]): + cur_bboxes = tf.shape(scores_pred)[0] + scores, idxes = tf.nn.top_k(scores_pred, k=tf.minimum(keep_topk, cur_bboxes), sorted=True) + + ymin, xmin, ymax, xmax = tf.gather(ymin, idxes), tf.gather(xmin, idxes), tf.gather(ymax, idxes), tf.gather(xmax, idxes) + + paddings_scores = tf.expand_dims(tf.stack([0, tf.maximum(keep_topk-cur_bboxes, 0)], axis=0), axis=0) + + return tf.pad(ymin, paddings_scores, "CONSTANT"), tf.pad(xmin, paddings_scores, "CONSTANT"),\ + tf.pad(ymax, paddings_scores, "CONSTANT"), tf.pad(xmax, paddings_scores, "CONSTANT"),\ + tf.pad(scores, paddings_scores, "CONSTANT") + +def nms_bboxes(scores_pred, bboxes_pred, nms_topk, nms_threshold, name): + with tf.name_scope(name, 'nms_bboxes', [scores_pred, bboxes_pred]): + idxes = tf.image.non_max_suppression(bboxes_pred, scores_pred, nms_topk, nms_threshold) + return tf.gather(scores_pred, idxes), tf.gather(bboxes_pred, idxes) + +def parse_by_class(cls_pred, bboxes_pred, num_classes, select_threshold, min_size, keep_topk, nms_topk, nms_threshold): + with tf.name_scope('select_bboxes', [cls_pred, bboxes_pred]): + scores_pred = tf.nn.softmax(cls_pred) + selected_bboxes, selected_scores = select_bboxes(scores_pred, bboxes_pred, num_classes, select_threshold) + for class_ind in range(1, num_classes): + ymin, xmin, ymax, xmax = tf.unstack(selected_bboxes[class_ind], 4, axis=-1) + #ymin, xmin, ymax, xmax = tf.squeeze(ymin), tf.squeeze(xmin), tf.squeeze(ymax), tf.squeeze(xmax) + ymin, xmin, ymax, xmax = clip_bboxes(ymin, xmin, ymax, xmax, 'clip_bboxes_{}'.format(class_ind)) + ymin, xmin, ymax, xmax, selected_scores[class_ind] = filter_bboxes(selected_scores[class_ind], + ymin, xmin, ymax, xmax, min_size, 'filter_bboxes_{}'.format(class_ind)) + ymin, xmin, ymax, xmax, selected_scores[class_ind] = sort_bboxes(selected_scores[class_ind], + ymin, xmin, ymax, xmax, keep_topk, 'sort_bboxes_{}'.format(class_ind)) + selected_bboxes[class_ind] = tf.stack([ymin, xmin, ymax, xmax], axis=-1) + selected_scores[class_ind], selected_bboxes[class_ind] = nms_bboxes(selected_scores[class_ind], selected_bboxes[class_ind], nms_topk, nms_threshold, 'nms_bboxes_{}'.format(class_ind)) + + return selected_bboxes, selected_scores + +def main(_): + with tf.Graph().as_default(): + out_shape = [FLAGS.train_image_size] * 2 + + image_input = tf.placeholder(tf.uint8, shape=(None, None, 3)) + shape_input = tf.placeholder(tf.int32, shape=(2,)) + + features = ssd_preprocessing.preprocess_for_eval(image_input, out_shape, data_format=FLAGS.data_format, output_rgb=False) + features = tf.expand_dims(features, axis=0) + + anchor_creator = anchor_manipulator.AnchorCreator(out_shape, + layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], + anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)], + 
extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)], + anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)], + #anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)], + layer_steps = [8, 16, 32, 64, 100, 300]) + all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors() + + anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6, + positive_threshold = None, + ignore_threshold = None, + prior_scaling=[0.1, 0.1, 0.2, 0.2]) + + decode_fn = lambda pred : anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial) + + with tf.variable_scope(FLAGS.model_scope, default_name=None, values=[features], reuse=tf.AUTO_REUSE): + backbone = ssd_net.VGG16Backbone(FLAGS.data_format) + feature_layers = backbone.forward(features, training=False) + location_pred, cls_pred = ssd_net.multibox_head(feature_layers, FLAGS.num_classes, all_num_anchors_depth, data_format=FLAGS.data_format) + if FLAGS.data_format == 'channels_first': + cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred] + location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred] + + cls_pred = [tf.reshape(pred, [-1, FLAGS.num_classes]) for pred in cls_pred] + location_pred = [tf.reshape(pred, [-1, 4]) for pred in location_pred] + + cls_pred = tf.concat(cls_pred, axis=0) + location_pred = tf.concat(location_pred, axis=0) + + with tf.device('/cpu:0'): + bboxes_pred = decode_fn(location_pred) + bboxes_pred = tf.concat(bboxes_pred, axis=0) + selected_bboxes, selected_scores = parse_by_class(cls_pred, bboxes_pred, + FLAGS.num_classes, FLAGS.select_threshold, FLAGS.min_size, + FLAGS.keep_topk, FLAGS.nms_topk, FLAGS.nms_threshold) + + labels_list = [] + scores_list = [] + bboxes_list = [] + for k, v in selected_scores.items(): + labels_list.append(tf.ones_like(v, tf.int32) * k) + scores_list.append(v) + bboxes_list.append(selected_bboxes[k]) + all_labels = tf.concat(labels_list, axis=0) + all_scores = tf.concat(scores_list, axis=0) + all_bboxes = tf.concat(bboxes_list, axis=0) + + saver = tf.train.Saver() + with tf.Session() as sess: + init = tf.global_variables_initializer() + sess.run(init) + + saver.restore(sess, get_checkpoint()) + + np_image = imread('./demo/test.jpg') + labels_, scores_, bboxes_ = sess.run([all_labels, all_scores, all_bboxes], feed_dict = {image_input : np_image, shape_input : np_image.shape[:-1]}) + + img_to_draw = draw_toolbox.bboxes_draw_on_img(np_image, labels_, scores_, bboxes_, thickness=2) + imsave('./demo/test_out.jpg', img_to_draw) + +if __name__ == '__main__': + tf.logging.set_verbosity(tf.logging.INFO) + tf.app.run() diff --git a/cv/detection/ssd/tensorflow/train_ssd.py b/cv/detection/ssd/tensorflow/train_ssd.py new file mode 100644 index 0000000000000000000000000000000000000000..4df6c21772044fa4d605de8ab49cf108e0ede872 --- /dev/null +++ b/cv/detection/ssd/tensorflow/train_ssd.py @@ -0,0 +1,518 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() + +from net import ssd_net + +from dataset import dataset_common +from preprocessing import ssd_preprocessing +from utility import anchor_manipulator +from utility import scaffolds + +# hardware related configuration +tf.app.flags.DEFINE_integer( + 'num_readers', 8, + 'The number of parallel readers that read data from the dataset.') +tf.app.flags.DEFINE_integer( + 'num_preprocessing_threads', 24, + 'The number of threads used to create the batches.') +tf.app.flags.DEFINE_integer( + 'num_cpu_threads', 0, + 'The number of cpu cores used to train.') +tf.app.flags.DEFINE_float( + 'gpu_memory_fraction', 1., 'GPU memory fraction to use.') +# scaffold related configuration +tf.app.flags.DEFINE_string( + 'data_dir', './dataset/tfrecords', + 'The directory where the dataset input data is stored.') +tf.app.flags.DEFINE_integer( + 'num_classes', 21, 'Number of classes to use in the dataset.') +tf.app.flags.DEFINE_string( + 'model_dir', './logs/', + 'The directory where the model will be stored.') +tf.app.flags.DEFINE_integer( + 'log_every_n_steps', 10, + 'The frequency with which logs are printed.') +tf.app.flags.DEFINE_integer( + 'save_summary_steps', 500, + 'The frequency with which summaries are saved, in seconds.') +tf.app.flags.DEFINE_integer( + 'save_checkpoints_secs', 7200, + 'The frequency with which the model is saved, in seconds.') +# model related configuration +tf.app.flags.DEFINE_integer( + 'train_image_size', 300, + 'The size of the input image for the model to use.') +tf.app.flags.DEFINE_integer( + 'train_epochs', 5, + 'The number of epochs to use for training.') +tf.app.flags.DEFINE_integer( + 'max_number_of_steps', 120000, + 'The max number of steps to use for training.') +tf.app.flags.DEFINE_integer( + 'batch_size', 32, + 'Batch size for training and evaluation.') +tf.app.flags.DEFINE_string( + 'data_format', 'channels_first', # 'channels_first' or 'channels_last' + 'A flag to override the data format used in the model. channels_first ' + 'provides a performance boost on GPU but is not always compatible ' + 'with CPU. 
If left unspecified, the data format will be chosen ' + 'automatically based on whether TensorFlow was built for CPU or GPU.') +tf.app.flags.DEFINE_float( + 'negative_ratio', 3., 'Negative ratio in the loss function.') +tf.app.flags.DEFINE_float( + 'match_threshold', 0.5, 'Matching threshold in the loss function.') +tf.app.flags.DEFINE_float( + 'neg_threshold', 0.5, 'Matching threshold for the negtive examples in the loss function.') +# optimizer related configuration +tf.app.flags.DEFINE_integer( + 'tf_random_seed', 20180503, 'Random seed for TensorFlow initializers.') +tf.app.flags.DEFINE_float( + 'weight_decay', 5e-4, 'The weight decay on the model weights.') +tf.app.flags.DEFINE_float( + 'momentum', 0.9, + 'The momentum for the MomentumOptimizer and RMSPropOptimizer.') +tf.app.flags.DEFINE_float('learning_rate', 1e-3, 'Initial learning rate.') +tf.app.flags.DEFINE_float( + 'end_learning_rate', 0.000001, + 'The minimal end learning rate used by a polynomial decay learning rate.') +# for learning rate piecewise_constant decay +tf.app.flags.DEFINE_string( + 'decay_boundaries', '500, 80000, 100000', + 'Learning rate decay boundaries by global_step (comma-separated list).') +tf.app.flags.DEFINE_string( + 'lr_decay_factors', '0.1, 1, 0.1, 0.01', + 'The values of learning_rate decay factor for each segment between boundaries (comma-separated list).') +# checkpoint related configuration +tf.app.flags.DEFINE_string( + 'checkpoint_path', './model', + 'The path to a checkpoint from which to fine-tune.') +tf.app.flags.DEFINE_string( + 'checkpoint_model_scope', 'vgg_16', + 'Model scope in the checkpoint. None if the same as the trained model.') +tf.app.flags.DEFINE_string( + 'model_scope', 'ssd300', + 'Model scope name used to replace the name_scope in checkpoint.') +tf.app.flags.DEFINE_string( + 'checkpoint_exclude_scopes', 'ssd300/multibox_head, ssd300/additional_layers, ssd300/conv4_3_scale', + 'Comma-separated list of scopes of variables to exclude when restoring from a checkpoint.') +tf.app.flags.DEFINE_boolean( + 'ignore_missing_vars', True, + 'When restoring a checkpoint would ignore missing variables.') +tf.app.flags.DEFINE_boolean( + 'multi_gpu', True, + 'Whether there is GPU to use for training.') +tf.app.flags.DEFINE_boolean( + 'use_amp', False, + 'Whether to use amp for training.') +tf.app.flags.DEFINE_string( + 'backbone', 'vgg16', + 'The backbone for feature extraction: vgg16/resnet18/resnet34/resnet50/resnet101.') + +FLAGS = tf.app.flags.FLAGS +#CUDA_VISIBLE_DEVICES +def validate_batch_size_for_multi_gpu(batch_size): + """For multi-gpu, batch-size must be a multiple of the number of + available GPUs. + + Note that this should eventually be handled by replicate_model_fn + directly. Multi-GPU support is currently experimental, however, + so doing the work here until that feature is in place. + """ + if FLAGS.multi_gpu: + from tensorflow.python.client import device_lib + + local_device_protos = device_lib.list_local_devices() + num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU']) + if not num_gpus: + raise ValueError('Multi-GPU mode was specified, but no GPUs ' + 'were found. To use CPU, run --multi_gpu=False.') + + remainder = batch_size % num_gpus + if remainder: + err = ('When running with multiple GPUs, batch size ' + 'must be a multiple of the number of available GPUs. ' + 'Found {} GPUs with a batch size of {}; try --batch_size={} instead.' 
+ ).format(num_gpus, batch_size, batch_size - remainder) + raise ValueError(err) + return num_gpus + return 0 + +def get_init_fn(): + return scaffolds.get_init_fn_for_scaffold(FLAGS.model_dir, FLAGS.checkpoint_path, + FLAGS.model_scope, FLAGS.checkpoint_model_scope, + FLAGS.checkpoint_exclude_scopes, FLAGS.ignore_missing_vars, + name_remap={'/kernel': '/weights', '/bias': '/biases'}) + +# couldn't find better way to pass params from input_fn to model_fn +# some tensors used by model_fn must be created in input_fn to ensure they are in the same graph +# but when we put these tensors to labels's dict, the replicate_model_fn will split them into each GPU +# the problem is that they shouldn't be splited +global_anchor_info = dict() + +def input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=FLAGS.batch_size): + def input_fn(): + out_shape = [FLAGS.train_image_size] * 2 + anchor_creator = anchor_manipulator.AnchorCreator(out_shape, + layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], + anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)], + extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)], + anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)], + layer_steps = [8, 16, 32, 64, 100, 300]) + all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors() + + num_anchors_per_layer = [] + for ind in range(len(all_anchors)): + num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]) + + anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6, + positive_threshold = FLAGS.match_threshold, + ignore_threshold = FLAGS.neg_threshold, + prior_scaling=[0.1, 0.1, 0.2, 0.2]) + + image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False) + anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial) + + image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes, + batch_size, + ('train' if is_training else 'val'), + os.path.join(FLAGS.data_dir, dataset_pattern), + FLAGS.num_readers, + FLAGS.num_preprocessing_threads, + image_preprocessing_fn, + anchor_encoder_fn, + num_epochs=FLAGS.train_epochs, + is_training=is_training) + global global_anchor_info + global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer), + 'num_anchors_per_layer': num_anchors_per_layer, + 'all_num_anchors_depth': all_num_anchors_depth } + + return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores} + return input_fn + +def modified_smooth_l1(bbox_pred, bbox_targets, bbox_inside_weights=1., bbox_outside_weights=1., sigma=1.): + """ + ResultLoss = outside_weights * SmoothL1(inside_weights * (bbox_pred - bbox_targets)) + SmoothL1(x) = 0.5 * (sigma * x)^2, if |x| < 1 / sigma^2 + |x| - 0.5 / sigma^2, otherwise + """ + with tf.name_scope('smooth_l1', [bbox_pred, bbox_targets]): + sigma2 = sigma * sigma + + inside_mul = tf.multiply(bbox_inside_weights, tf.subtract(bbox_pred, bbox_targets)) + + smooth_l1_sign = tf.cast(tf.less(tf.abs(inside_mul), 1.0 / sigma2), 
tf.float32) + smooth_l1_option1 = tf.multiply(tf.multiply(inside_mul, inside_mul), 0.5 * sigma2) + smooth_l1_option2 = tf.subtract(tf.abs(inside_mul), 0.5 / sigma2) + smooth_l1_result = tf.add(tf.multiply(smooth_l1_option1, smooth_l1_sign), + tf.multiply(smooth_l1_option2, tf.abs(tf.subtract(smooth_l1_sign, 1.0)))) + + outside_mul = tf.multiply(bbox_outside_weights, smooth_l1_result) + + return outside_mul + + +# from scipy.misc import imread, imsave, imshow, imresize +# import numpy as np +# from utility import draw_toolbox + +# def save_image_with_bbox(image, labels_, scores_, bboxes_): +# if not hasattr(save_image_with_bbox, "counter"): +# save_image_with_bbox.counter = 0 # it doesn't exist yet, so initialize it +# save_image_with_bbox.counter += 1 + +# img_to_draw = np.copy(image) + +# img_to_draw = draw_toolbox.bboxes_draw_on_img(img_to_draw, labels_, scores_, bboxes_, thickness=2) +# imsave(os.path.join('./debug/{}.jpg').format(save_image_with_bbox.counter), img_to_draw) +# return save_image_with_bbox.counter + +def ssd_model_fn(features, labels, mode, params): + """model_fn for SSD to be used with our Estimator.""" + shape = labels['shape'] + loc_targets = labels['loc_targets'] + cls_targets = labels['cls_targets'] + match_scores = labels['match_scores'] + + global global_anchor_info + decode_fn = global_anchor_info['decode_fn'] + num_anchors_per_layer = global_anchor_info['num_anchors_per_layer'] + all_num_anchors_depth = global_anchor_info['all_num_anchors_depth'] + + # bboxes_pred = decode_fn(loc_targets[0]) + # bboxes_pred = [tf.reshape(preds, [-1, 4]) for preds in bboxes_pred] + # bboxes_pred = tf.concat(bboxes_pred, axis=0) + # save_image_op = tf.py_func(save_image_with_bbox, + # [ssd_preprocessing.unwhiten_image(features[0]), + # tf.clip_by_value(cls_targets[0], 0, tf.int64.max), + # match_scores[0], + # bboxes_pred], + # tf.int64, stateful=True) + # with tf.control_dependencies([save_image_op]): + + #print(all_num_anchors_depth) + with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE): + ssd_backbone = ssd_net.SSDBackbone( + FLAGS.backbone, + training=(mode == tf.estimator.ModeKeys.TRAIN), + data_format=params['data_format']) + feature_layers = ssd_backbone.forward(features) + #print(feature_layers) + location_pred, cls_pred = ssd_net.multibox_head(feature_layers, params['num_classes'], all_num_anchors_depth, data_format=params['data_format']) + + if params['data_format'] == 'channels_first': + cls_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in cls_pred] + location_pred = [tf.transpose(pred, [0, 2, 3, 1]) for pred in location_pred] + + cls_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, params['num_classes']]) for pred in cls_pred] + location_pred = [tf.reshape(pred, [tf.shape(features)[0], -1, 4]) for pred in location_pred] + + cls_pred = tf.concat(cls_pred, axis=1) + location_pred = tf.concat(location_pred, axis=1) + + cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']]) + location_pred = tf.reshape(location_pred, [-1, 4]) + + with tf.device('/cpu:0'): + with tf.control_dependencies([cls_pred, location_pred]): + with tf.name_scope('post_forward'): + #bboxes_pred = decode_fn(location_pred) + bboxes_pred = tf.map_fn(lambda _preds : decode_fn(_preds), + tf.reshape(location_pred, [tf.shape(features)[0], -1, 4]), + dtype=[tf.float32] * len(num_anchors_per_layer), back_prop=False) + #cls_targets = tf.Print(cls_targets, 
[tf.shape(bboxes_pred[0]),tf.shape(bboxes_pred[1]),tf.shape(bboxes_pred[2]),tf.shape(bboxes_pred[3])]) + bboxes_pred = [tf.reshape(preds, [-1, 4]) for preds in bboxes_pred] + bboxes_pred = tf.concat(bboxes_pred, axis=0) + + flaten_cls_targets = tf.reshape(cls_targets, [-1]) + flaten_match_scores = tf.reshape(match_scores, [-1]) + flaten_loc_targets = tf.reshape(loc_targets, [-1, 4]) + + # each positive examples has one label + positive_mask = flaten_cls_targets > 0 + n_positives = tf.count_nonzero(positive_mask) + + batch_n_positives = tf.count_nonzero(cls_targets, -1) + + batch_negtive_mask = tf.equal(cls_targets, 0)#tf.logical_and(tf.equal(cls_targets, 0), match_scores > 0.) + batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1) + + batch_n_neg_select = tf.cast(params['negative_ratio'] * tf.cast(batch_n_positives, tf.float32), tf.int32) + batch_n_neg_select = tf.minimum(batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32)) + + # hard negative mining for classification + predictions_for_bg = tf.nn.softmax(tf.reshape(cls_pred, [tf.shape(features)[0], -1, params['num_classes']]))[:, :, 0] + prob_for_negtives = tf.where(batch_negtive_mask, + 0. - predictions_for_bg, + # ignore all the positives + 0. - tf.ones_like(predictions_for_bg)) + topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives, k=tf.shape(prob_for_negtives)[1]) + score_at_k = tf.gather_nd(topk_prob_for_bg, tf.stack([tf.range(tf.shape(features)[0]), batch_n_neg_select - 1], axis=-1)) + + selected_neg_mask = prob_for_negtives >= tf.expand_dims(score_at_k, axis=-1) + + # include both selected negtive and all positive examples + final_mask = tf.stop_gradient(tf.logical_or(tf.reshape(tf.logical_and(batch_negtive_mask, selected_neg_mask), [-1]), positive_mask)) + total_examples = tf.count_nonzero(final_mask) + + cls_pred = tf.boolean_mask(cls_pred, final_mask) + location_pred = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask)) + flaten_cls_targets = tf.boolean_mask(tf.clip_by_value(flaten_cls_targets, 0, params['num_classes']), final_mask) + flaten_loc_targets = tf.stop_gradient(tf.boolean_mask(flaten_loc_targets, positive_mask)) + + predictions = { + 'classes': tf.argmax(cls_pred, axis=-1), + 'probabilities': tf.reduce_max(tf.nn.softmax(cls_pred, name='softmax_tensor'), axis=-1), + 'loc_predict': bboxes_pred } + + cls_accuracy = tf.metrics.accuracy(flaten_cls_targets, predictions['classes']) + metrics = {'cls_accuracy': cls_accuracy} + + # Create a tensor named train_accuracy for logging purposes. + tf.identity(cls_accuracy[1], name='cls_accuracy') + tf.summary.scalar('cls_accuracy', cls_accuracy[1]) + + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) + + # Calculate loss, which includes softmax cross entropy and L2 regularization. + #cross_entropy = tf.cond(n_positives > 0, lambda: tf.losses.sparse_softmax_cross_entropy(labels=flaten_cls_targets, logits=cls_pred), lambda: 0.)# * (params['negative_ratio'] + 1.) + #flaten_cls_targets=tf.Print(flaten_cls_targets, [flaten_loc_targets],summarize=50000) + cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=flaten_cls_targets, logits=cls_pred) * (params['negative_ratio'] + 1.) + # Create a tensor named cross_entropy for logging purposes. 
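+                # (the explicit name below lets the LoggingTensorHook configured in main()
+                # fetch this loss by name for per-step console logging)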
+ tf.identity(cross_entropy, name='cross_entropy_loss') + tf.summary.scalar('cross_entropy_loss', cross_entropy) + + #loc_loss = tf.cond(n_positives > 0, lambda: modified_smooth_l1(location_pred, tf.stop_gradient(flaten_loc_targets), sigma=1.), lambda: tf.zeros_like(location_pred)) + loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.) + #loc_loss = modified_smooth_l1(location_pred, tf.stop_gradient(gtargets)) + loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1), name='location_loss') + tf.summary.scalar('location_loss', loc_loss) + tf.losses.add_loss(loc_loss) + + l2_loss_vars = [] + for trainable_var in tf.trainable_variables(): + if '_bn' not in trainable_var.name: + if 'conv4_3_scale' not in trainable_var.name: + l2_loss_vars.append(tf.nn.l2_loss(trainable_var)) + else: + l2_loss_vars.append(tf.nn.l2_loss(trainable_var) * 0.1) + # Add weight decay to the loss. We exclude the batch norm variables because + # doing so leads to a small improvement in accuracy. + total_loss = tf.add(cross_entropy + loc_loss, tf.multiply(params['weight_decay'], tf.add_n(l2_loss_vars), name='l2_loss'), name='total_loss') + + if mode == tf.estimator.ModeKeys.TRAIN: + global_step = tf.train.get_or_create_global_step() + + lr_values = [params['learning_rate'] * decay for decay in params['lr_decay_factors']] + learning_rate = tf.train.piecewise_constant(tf.cast(global_step, tf.int32), + [int(_) for _ in params['decay_boundaries']], + lr_values) + truncated_learning_rate = tf.maximum(learning_rate, tf.constant(params['end_learning_rate'], dtype=learning_rate.dtype), name='learning_rate') + # Create a tensor named learning_rate for logging purposes. + tf.summary.scalar('learning_rate', truncated_learning_rate) + + optimizer = tf.train.MomentumOptimizer(learning_rate=truncated_learning_rate, + momentum=params['momentum']) + if params['use_amp']: + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + # optimizer = tf.contrib.estimator.TowerOptimizer(optimizer) + + # Batch norm requires update_ops to be added as a train_op dependency. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(total_loss, global_step) + else: + train_op = None + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=total_loss, + train_op=train_op, + eval_metric_ops=metrics, + scaffold=tf.train.Scaffold(init_fn=get_init_fn())) + +def parse_comma_list(args): + return [float(s.strip()) for s in args.split(',')] + +def main(_): + # Using the Winograd non-fused algorithms provides a small performance boost. + os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + + try: + from dltest import show_training_arguments + show_training_arguments(FLAGS) + except: + pass + + gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction) + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, intra_op_parallelism_threads=FLAGS.num_cpu_threads, inter_op_parallelism_threads=FLAGS.num_cpu_threads, gpu_options=gpu_options) + + num_gpus = validate_batch_size_for_multi_gpu(FLAGS.batch_size) + + # Set up a RunConfig to only save checkpoints once per training cycle. 
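+    # Note: tf.estimator.RunConfig accepts at most one of save_checkpoints_secs /
+    # save_checkpoints_steps, so the step-based trigger is explicitly set to None below.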
+ run_config = tf.estimator.RunConfig().replace( + save_checkpoints_secs=FLAGS.save_checkpoints_secs).replace( + save_checkpoints_steps=None).replace( + save_summary_steps=FLAGS.save_summary_steps).replace( + keep_checkpoint_max=5).replace( + tf_random_seed=FLAGS.tf_random_seed).replace( + log_step_count_steps=FLAGS.log_every_n_steps).replace( + session_config=config) + + # replicate_ssd_model_fn = tf.contrib.estimator.replicate_model_fn(ssd_model_fn, loss_reduction=tf.losses.Reduction.MEAN) + replicate_ssd_model_fn =ssd_model_fn + ssd_detector = tf.estimator.Estimator( + model_fn=replicate_ssd_model_fn, model_dir=FLAGS.model_dir, config=run_config, + params={ + 'num_gpus': num_gpus, + 'data_format': FLAGS.data_format, + 'batch_size': FLAGS.batch_size, + 'model_scope': FLAGS.model_scope, + 'num_classes': FLAGS.num_classes, + 'negative_ratio': FLAGS.negative_ratio, + 'match_threshold': FLAGS.match_threshold, + 'neg_threshold': FLAGS.neg_threshold, + 'weight_decay': FLAGS.weight_decay, + 'momentum': FLAGS.momentum, + 'learning_rate': FLAGS.learning_rate, + 'end_learning_rate': FLAGS.end_learning_rate, + 'decay_boundaries': parse_comma_list(FLAGS.decay_boundaries), + 'lr_decay_factors': parse_comma_list(FLAGS.lr_decay_factors), + 'use_amp':FLAGS.use_amp, + }) + tensors_to_log = { + 'lr': 'learning_rate', + 'ce': 'cross_entropy_loss', + 'loc': 'location_loss', + 'loss': 'total_loss', + 'l2': 'l2_loss', + 'acc': 'post_forward/cls_accuracy', + } + logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps, + formatter=lambda dicts: (', '.join(['%s=%.6f' % (k, v) for k, v in dicts.items()]))) + + #hook = tf.train.ProfilerHook(save_steps=50, output_dir='.', show_memory=True) + print('Starting a training cycle.') + ssd_detector.train(input_fn=input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=FLAGS.batch_size), + hooks=[logging_hook], max_steps=FLAGS.max_number_of_steps) + +if __name__ == '__main__': + tf.logging.set_verbosity(tf.logging.INFO) + tf.app.run() + + + # cls_targets = tf.reshape(cls_targets, [-1]) + # match_scores = tf.reshape(match_scores, [-1]) + # loc_targets = tf.reshape(loc_targets, [-1, 4]) + + # # each positive examples has one label + # positive_mask = cls_targets > 0 + # n_positives = tf.count_nonzero(positive_mask) + + # negtive_mask = tf.logical_and(tf.equal(cls_targets, 0), match_scores > 0.) + # n_negtives = tf.count_nonzero(negtive_mask) + + # n_neg_to_select = tf.cast(params['negative_ratio'] * tf.cast(n_positives, tf.float32), tf.int32) + # n_neg_to_select = tf.minimum(n_neg_to_select, tf.cast(n_negtives, tf.int32)) + + # # hard negative mining for classification + # predictions_for_bg = tf.nn.softmax(cls_pred)[:, 0] + + # prob_for_negtives = tf.where(negtive_mask, + # 0. - predictions_for_bg, + # # ignore all the positives + # 0. 
- tf.ones_like(predictions_for_bg)) + # topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives, k=n_neg_to_select) + # selected_neg_mask = prob_for_negtives > topk_prob_for_bg[-1] + + # # include both selected negtive and all positive examples + # final_mask = tf.stop_gradient(tf.logical_or(tf.logical_and(negtive_mask, selected_neg_mask), positive_mask)) + # total_examples = tf.count_nonzero(final_mask) + + # glabels = tf.boolean_mask(tf.clip_by_value(cls_targets, 0, FLAGS.num_classes), final_mask) + # cls_pred = tf.boolean_mask(cls_pred, final_mask) + # location_pred = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask)) + # loc_targets = tf.boolean_mask(loc_targets, tf.stop_gradient(positive_mask)) diff --git a/cv/detection/ssd/tensorflow/utility/anchor_manipulator.py b/cv/detection/ssd/tensorflow/utility/anchor_manipulator.py new file mode 100644 index 0000000000000000000000000000000000000000..de29ff4cf3e5895ae7aa886bd2e9b097ece2bac7 --- /dev/null +++ b/cv/detection/ssd/tensorflow/utility/anchor_manipulator.py @@ -0,0 +1,348 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import math + +import tensorflow.compat.v1 as tf +tf.disable_eager_execution() +import numpy as np + + +# Fix bug by adjust the position of arguments on TensorFlow 1.15 +def tf_name_scope_wrapper(tf_name_scope): + def _real(*args, **kwargs): + total_args = len(args) + len(kwargs) + if total_args == len(args) == 2: + if isinstance(args[1], str): + return tf_name_scope(*args, **kwargs) + else: + return tf_name_scope(args[0], args[0], args[1]) + return tf_name_scope(*args, **kwargs) + return _real + +tf.name_scope = tf_name_scope_wrapper(tf.name_scope) + + +def areas(gt_bboxes): + with tf.name_scope('bboxes_areas', [gt_bboxes]): + ymin, xmin, ymax, xmax = tf.split(gt_bboxes, 4, axis=1) + return (xmax - xmin) * (ymax - ymin) + +def intersection(gt_bboxes, default_bboxes): + with tf.name_scope('bboxes_intersection', [gt_bboxes, default_bboxes]): + # num_anchors x 1 + ymin, xmin, ymax, xmax = tf.split(gt_bboxes, 4, axis=1) + # 1 x num_anchors + gt_ymin, gt_xmin, gt_ymax, gt_xmax = [tf.transpose(b, perm=[1, 0]) for b in tf.split(default_bboxes, 4, axis=1)] + # broadcast here to generate the full matrix + int_ymin = tf.maximum(ymin, gt_ymin) + int_xmin = tf.maximum(xmin, gt_xmin) + int_ymax = tf.minimum(ymax, gt_ymax) + int_xmax = tf.minimum(xmax, gt_xmax) + h = tf.maximum(int_ymax - int_ymin, 0.) + w = tf.maximum(int_xmax - int_xmin, 0.) 
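+        # broadcasting the [num_gt, 1] ground-truth coordinates against the [1, num_anchors]
+        # anchor coordinates makes h * w a [num_gt, num_anchors] matrix of pairwise intersection areas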
+
+        return h * w
+def iou_matrix(gt_bboxes, default_bboxes):
+    with tf.name_scope('iou_matrix', [gt_bboxes, default_bboxes]):
+        inter_vol = intersection(gt_bboxes, default_bboxes)
+        # broadcast
+        union_vol = areas(gt_bboxes) + tf.transpose(areas(default_bboxes), perm=[1, 0]) - inter_vol
+
+        return tf.where(tf.equal(union_vol, 0.0),
+                        tf.zeros_like(inter_vol), tf.truediv(inter_vol, union_vol))
+
+def do_dual_max_match(overlap_matrix, low_thres, high_thres, ignore_between=True, gt_max_first=True):
+    '''
+    overlap_matrix: num_gt * num_anchors
+    '''
+    with tf.name_scope('dual_max_match', [overlap_matrix]):
+        # first match from the anchors' side
+        anchors_to_gt = tf.argmax(overlap_matrix, axis=0)
+        # the matching degree
+        match_values = tf.reduce_max(overlap_matrix, axis=0)
+
+        #positive_mask = tf.greater(match_values, high_thres)
+        less_mask = tf.less(match_values, low_thres)
+        between_mask = tf.logical_and(tf.less(match_values, high_thres), tf.greater_equal(match_values, low_thres))
+        negative_mask = less_mask if ignore_between else between_mask
+        ignore_mask = between_mask if ignore_between else less_mask
+        # fill all negative positions with -1 and all ignored positions with -2
+        match_indices = tf.where(negative_mask, -1 * tf.ones_like(anchors_to_gt), anchors_to_gt)
+        match_indices = tf.where(ignore_mask, -2 * tf.ones_like(match_indices), match_indices)
+
+        # negative values have no effect in tf.one_hot (they produce all zeros along that axis),
+        # so all positive match positions in anchors_to_gt_mask are 1 and all others are 0
+        anchors_to_gt_mask = tf.one_hot(tf.clip_by_value(match_indices, -1, tf.cast(tf.shape(overlap_matrix)[0], tf.int64)),
+                                        tf.shape(overlap_matrix)[0], on_value=1, off_value=0, axis=0, dtype=tf.int32)
+        # match from the ground truth's side
+        gt_to_anchors = tf.argmax(overlap_matrix, axis=1)
+
+        if gt_max_first:
+            # the max match from the ground truth's side has higher priority
+            left_gt_to_anchors_mask = tf.one_hot(gt_to_anchors, tf.shape(overlap_matrix)[1], on_value=1, off_value=0, axis=1, dtype=tf.int32)
+        else:
+            # the max match from the anchors' side has higher priority;
+            # use the match from the ground truth's side only when the matching degree from the anchors' side is lower than the positive threshold
+            left_gt_to_anchors_mask = tf.cast(tf.logical_and(tf.reduce_max(anchors_to_gt_mask, axis=1, keep_dims=True) < 1,
+                                                             tf.one_hot(gt_to_anchors, tf.shape(overlap_matrix)[1],
+                                                                        on_value=True, off_value=False, axis=1, dtype=tf.bool)
+                                                             ), tf.int64)
+        # cannot use left_gt_to_anchors_mask here: many ground truths may match the same anchor,
+        # so we should pick the highest-scoring one even when merging matches from the ground truth's side
+        left_gt_to_anchors_scores = overlap_matrix * tf.to_float(left_gt_to_anchors_mask)
+        # merge the matching results from the ground truth's side with the original matching results from the anchors' side,
+        # then select the overlap scores of those matching pairs
+        selected_scores = tf.gather_nd(overlap_matrix, tf.stack([tf.where(tf.reduce_max(left_gt_to_anchors_mask, axis=0) > 0,
+                                                                          tf.argmax(left_gt_to_anchors_scores, axis=0),
+                                                                          anchors_to_gt),
+                                                                 tf.range(tf.cast(tf.shape(overlap_matrix)[1], tf.int64))], axis=1))
+        # return the matching results for both foreground and background anchors, together with the overlap scores
+        return tf.where(tf.reduce_max(left_gt_to_anchors_mask, axis=0) > 0,
+                        tf.argmax(left_gt_to_anchors_scores, axis=0),
+                        match_indices), selected_scores
+
+# def save_anchors(bboxes, labels, anchors_point):
+#     if not hasattr(save_image_with_bbox, "counter"):
+#
save_image_with_bbox.counter = 0 # it doesn't exist yet, so initialize it +# save_image_with_bbox.counter += 1 + +# np.save('./debug/bboxes_{}.npy'.format(save_image_with_bbox.counter), np.copy(bboxes)) +# np.save('./debug/labels_{}.npy'.format(save_image_with_bbox.counter), np.copy(labels)) +# np.save('./debug/anchors_{}.npy'.format(save_image_with_bbox.counter), np.copy(anchors_point)) +# return save_image_with_bbox.counter + +class AnchorEncoder(object): + def __init__(self, allowed_borders, positive_threshold, ignore_threshold, prior_scaling, clip=False): + super(AnchorEncoder, self).__init__() + self._all_anchors = None + self._allowed_borders = allowed_borders + self._positive_threshold = positive_threshold + self._ignore_threshold = ignore_threshold + self._prior_scaling = prior_scaling + self._clip = clip + + def center2point(self, center_y, center_x, height, width): + return center_y - height / 2., center_x - width / 2., center_y + height / 2., center_x + width / 2., + + def point2center(self, ymin, xmin, ymax, xmax): + height, width = (ymax - ymin), (xmax - xmin) + return ymin + height / 2., xmin + width / 2., height, width + + def encode_all_anchors(self, labels, bboxes, all_anchors, all_num_anchors_depth, all_num_anchors_spatial, debug=False): + # y, x, h, w are all in range [0, 1] relative to the original image size + # shape info: + # y_on_image, x_on_image: layers_shapes[0] * layers_shapes[1] + # h_on_image, w_on_image: num_anchors + assert (len(all_num_anchors_depth)==len(all_num_anchors_spatial)) and (len(all_num_anchors_depth)==len(all_anchors)), 'inconsist num layers for anchors.' + with tf.name_scope('encode_all_anchors'): + num_layers = len(all_num_anchors_depth) + list_anchors_ymin = [] + list_anchors_xmin = [] + list_anchors_ymax = [] + list_anchors_xmax = [] + tiled_allowed_borders = [] + for ind, anchor in enumerate(all_anchors): + anchors_ymin_, anchors_xmin_, anchors_ymax_, anchors_xmax_ = self.center2point(anchor[0], anchor[1], anchor[2], anchor[3]) + + list_anchors_ymin.append(tf.reshape(anchors_ymin_, [-1])) + list_anchors_xmin.append(tf.reshape(anchors_xmin_, [-1])) + list_anchors_ymax.append(tf.reshape(anchors_ymax_, [-1])) + list_anchors_xmax.append(tf.reshape(anchors_xmax_, [-1])) + + tiled_allowed_borders.extend([self._allowed_borders[ind]] * all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]) + + anchors_ymin = tf.concat(list_anchors_ymin, 0, name='concat_ymin') + anchors_xmin = tf.concat(list_anchors_xmin, 0, name='concat_xmin') + anchors_ymax = tf.concat(list_anchors_ymax, 0, name='concat_ymax') + anchors_xmax = tf.concat(list_anchors_xmax, 0, name='concat_xmax') + + if self._clip: + anchors_ymin = tf.clip_by_value(anchors_ymin, 0., 1.) + anchors_xmin = tf.clip_by_value(anchors_xmin, 0., 1.) + anchors_ymax = tf.clip_by_value(anchors_ymax, 0., 1.) + anchors_xmax = tf.clip_by_value(anchors_xmax, 0., 1.) + + anchor_allowed_borders = tf.stack(tiled_allowed_borders, 0, name='concat_allowed_borders') + + inside_mask = tf.logical_and(tf.logical_and(anchors_ymin > -anchor_allowed_borders * 1., + anchors_xmin > -anchor_allowed_borders * 1.), + tf.logical_and(anchors_ymax < (1. + anchor_allowed_borders * 1.), + anchors_xmax < (1. 
+ anchor_allowed_borders * 1.)))
+
+            anchors_point = tf.stack([anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)
+
+            # save_anchors_op = tf.py_func(save_anchors,
+            #                 [bboxes,
+            #                 labels,
+            #                 anchors_point],
+            #                 tf.int64, stateful=True)
+
+            # with tf.control_dependencies([save_anchors_op]):
+            overlap_matrix = iou_matrix(bboxes, anchors_point) * tf.cast(tf.expand_dims(inside_mask, 0), tf.float32)
+            matched_gt, gt_scores = do_dual_max_match(overlap_matrix, self._ignore_threshold, self._positive_threshold)
+            # get all positive matching positions
+            matched_gt_mask = matched_gt > -1
+            matched_indices = tf.clip_by_value(matched_gt, 0, tf.int64.max)
+            # the gathered labels may be garbage at the non-positive positions
+            gt_labels = tf.gather(labels, matched_indices)
+            # filter out the invalid labels
+            gt_labels = gt_labels * tf.cast(matched_gt_mask, tf.int64)
+            # set the ignored positions to -1
+            gt_labels = gt_labels + (-1 * tf.cast(matched_gt < -1, tf.int64))
+
+            gt_ymin, gt_xmin, gt_ymax, gt_xmax = tf.unstack(tf.gather(bboxes, matched_indices), 4, axis=-1)
+
+            # transform to center / size.
+            gt_cy, gt_cx, gt_h, gt_w = self.point2center(gt_ymin, gt_xmin, gt_ymax, gt_xmax)
+            anchor_cy, anchor_cx, anchor_h, anchor_w = self.point2center(anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax)
+            # encode features.
+            # the prior_scaling (effectively 10x for the center offsets and 5x for the width/height)
+            # balances the regression loss between the center and the size terms
+            gt_cy = (gt_cy - anchor_cy) / anchor_h / self._prior_scaling[0]
+            gt_cx = (gt_cx - anchor_cx) / anchor_w / self._prior_scaling[1]
+            gt_h = tf.log(gt_h / anchor_h) / self._prior_scaling[2]
+            gt_w = tf.log(gt_w / anchor_w) / self._prior_scaling[3]
+            # gt_targets is now the regression target, but it may also be garbage at the non-positive positions
+            if debug:
+                gt_targets = tf.stack([anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax], axis=-1)
+            else:
+                gt_targets = tf.stack([gt_cy, gt_cx, gt_h, gt_w], axis=-1)
+            # set all targets of non-positive positions to 0
+            gt_targets = tf.expand_dims(tf.cast(matched_gt_mask, tf.float32), -1) * gt_targets
+            self._all_anchors = (anchor_cy, anchor_cx, anchor_h, anchor_w)
+            return gt_targets, gt_labels, gt_scores
+
+    # return a list, in which each element has:
+    #   shape: [feature_h, feature_w, num_anchors, 4]
+    #   order: ymin, xmin, ymax, xmax
+    def decode_all_anchors(self, pred_location, num_anchors_per_layer):
+        assert self._all_anchors is not None, 'no anchors to decode.'
+        with tf.name_scope('decode_all_anchors', [pred_location]):
+            anchor_cy, anchor_cx, anchor_h, anchor_w = self._all_anchors
+
+            pred_h = tf.exp(pred_location[:, -2] * self._prior_scaling[2]) * anchor_h
+            pred_w = tf.exp(pred_location[:, -1] * self._prior_scaling[3]) * anchor_w
+            pred_cy = pred_location[:, 0] * self._prior_scaling[0] * anchor_h + anchor_cy
+            pred_cx = pred_location[:, 1] * self._prior_scaling[1] * anchor_w + anchor_cx
+
+            return tf.split(tf.stack(self.center2point(pred_cy, pred_cx, pred_h, pred_w), axis=-1), num_anchors_per_layer, axis=0)
+
+    def ext_decode_all_anchors(self, pred_location, all_anchors, all_num_anchors_depth, all_num_anchors_spatial):
+        assert (len(all_num_anchors_depth)==len(all_num_anchors_spatial)) and (len(all_num_anchors_depth)==len(all_anchors)), 'inconsistent number of layers for anchors.'
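+        # Same decoding math as decode_all_anchors, but the anchor grid is rebuilt here from
+        # the anchors passed in, instead of using the cached self._all_anchors.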
+ with tf.name_scope('ext_decode_all_anchors', [pred_location]): + num_anchors_per_layer = [] + for ind in range(len(all_anchors)): + num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]) + + num_layers = len(all_num_anchors_depth) + list_anchors_ymin = [] + list_anchors_xmin = [] + list_anchors_ymax = [] + list_anchors_xmax = [] + tiled_allowed_borders = [] + for ind, anchor in enumerate(all_anchors): + anchors_ymin_, anchors_xmin_, anchors_ymax_, anchors_xmax_ = self.center2point(anchor[0], anchor[1], anchor[2], anchor[3]) + + list_anchors_ymin.append(tf.reshape(anchors_ymin_, [-1])) + list_anchors_xmin.append(tf.reshape(anchors_xmin_, [-1])) + list_anchors_ymax.append(tf.reshape(anchors_ymax_, [-1])) + list_anchors_xmax.append(tf.reshape(anchors_xmax_, [-1])) + + anchors_ymin = tf.concat(list_anchors_ymin, 0, name='concat_ymin') + anchors_xmin = tf.concat(list_anchors_xmin, 0, name='concat_xmin') + anchors_ymax = tf.concat(list_anchors_ymax, 0, name='concat_ymax') + anchors_xmax = tf.concat(list_anchors_xmax, 0, name='concat_xmax') + + anchor_cy, anchor_cx, anchor_h, anchor_w = self.point2center(anchors_ymin, anchors_xmin, anchors_ymax, anchors_xmax) + + pred_h = tf.exp(pred_location[:,-2] * self._prior_scaling[2]) * anchor_h + pred_w = tf.exp(pred_location[:, -1] * self._prior_scaling[3]) * anchor_w + pred_cy = pred_location[:, 0] * self._prior_scaling[0] * anchor_h + anchor_cy + pred_cx = pred_location[:, 1] * self._prior_scaling[1] * anchor_w + anchor_cx + + return tf.split(tf.stack(self.center2point(pred_cy, pred_cx, pred_h, pred_w), axis=-1), num_anchors_per_layer, axis=0) + +class AnchorCreator(object): + def __init__(self, img_shape, layers_shapes, anchor_scales, extra_anchor_scales, anchor_ratios, layer_steps): + super(AnchorCreator, self).__init__() + # img_shape -> (height, width) + self._img_shape = img_shape + self._layers_shapes = layers_shapes + self._anchor_scales = anchor_scales + self._extra_anchor_scales = extra_anchor_scales + self._anchor_ratios = anchor_ratios + self._layer_steps = layer_steps + self._anchor_offset = [0.5] * len(self._layers_shapes) + + def get_layer_anchors(self, layer_shape, anchor_scale, extra_anchor_scale, anchor_ratio, layer_step, offset = 0.5): + ''' assume layer_shape[0] = 6, layer_shape[1] = 5 + x_on_layer = [[0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4]] + y_on_layer = [[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 3], + [4, 4, 4, 4, 4], + [5, 5, 5, 5, 5]] + ''' + with tf.name_scope('get_layer_anchors'): + x_on_layer, y_on_layer = tf.meshgrid(tf.range(layer_shape[1]), tf.range(layer_shape[0])) + + y_on_image = (tf.cast(y_on_layer, tf.float32) + offset) * layer_step / self._img_shape[0] + x_on_image = (tf.cast(x_on_layer, tf.float32) + offset) * layer_step / self._img_shape[1] + + num_anchors_along_depth = len(anchor_scale) * len(anchor_ratio) + len(extra_anchor_scale) + num_anchors_along_spatial = layer_shape[1] * layer_shape[0] + + list_h_on_image = [] + list_w_on_image = [] + + global_index = 0 + # for square anchors + for _, scale in enumerate(extra_anchor_scale): + list_h_on_image.append(scale) + list_w_on_image.append(scale) + global_index += 1 + # for other aspect ratio anchors + for scale_index, scale in enumerate(anchor_scale): + for ratio_index, ratio in enumerate(anchor_ratio): + list_h_on_image.append(scale / math.sqrt(ratio)) + list_w_on_image.append(scale * math.sqrt(ratio)) + global_index += 1 + # shape 
info: + # y_on_image, x_on_image: layers_shapes[0] * layers_shapes[1] + # h_on_image, w_on_image: num_anchors_along_depth + return tf.expand_dims(y_on_image, axis=-1), tf.expand_dims(x_on_image, axis=-1), \ + tf.constant(list_h_on_image, dtype=tf.float32), \ + tf.constant(list_w_on_image, dtype=tf.float32), num_anchors_along_depth, num_anchors_along_spatial + + def get_all_anchors(self): + all_anchors = [] + all_num_anchors_depth = [] + all_num_anchors_spatial = [] + for layer_index, layer_shape in enumerate(self._layers_shapes): + anchors_this_layer = self.get_layer_anchors(layer_shape, + self._anchor_scales[layer_index], + self._extra_anchor_scales[layer_index], + self._anchor_ratios[layer_index], + self._layer_steps[layer_index], + self._anchor_offset[layer_index]) + all_anchors.append(anchors_this_layer[:-2]) + all_num_anchors_depth.append(anchors_this_layer[-2]) + all_num_anchors_spatial.append(anchors_this_layer[-1]) + return all_anchors, all_num_anchors_depth, all_num_anchors_spatial + diff --git a/cv/detection/ssd/tensorflow/utility/anchor_manipulator_unittest.py b/cv/detection/ssd/tensorflow/utility/anchor_manipulator_unittest.py new file mode 100644 index 0000000000000000000000000000000000000000..bbacc6416c2c0003bada31f8bce400a3ada83396 --- /dev/null +++ b/cv/detection/ssd/tensorflow/utility/anchor_manipulator_unittest.py @@ -0,0 +1,156 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import tensorflow as tf +from scipy.misc import imread, imsave, imshow, imresize +import numpy as np +import sys; sys.path.insert(0, ".") +from utility import draw_toolbox +from utility import anchor_manipulator +from preprocessing import ssd_preprocessing + +slim = tf.contrib.slim + +def save_image_with_bbox(image, labels_, scores_, bboxes_): + if not hasattr(save_image_with_bbox, "counter"): + save_image_with_bbox.counter = 0 # it doesn't exist yet, so initialize it + save_image_with_bbox.counter += 1 + + img_to_draw = np.copy(image) + + img_to_draw = draw_toolbox.bboxes_draw_on_img(img_to_draw, labels_, scores_, bboxes_, thickness=2) + imsave(os.path.join('./debug/{}.jpg').format(save_image_with_bbox.counter), img_to_draw) + return save_image_with_bbox.counter + +def slim_get_split(file_pattern='{}_????'): + # Features in Pascal VOC TFRecords. 
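+    # Each serialized tf.Example carries one encoded image plus variable-length lists of
+    # box coordinates and labels; the keys below mirror that schema for the slim decoder.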
+ keys_to_features = { + 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'), + 'image/height': tf.FixedLenFeature([1], tf.int64), + 'image/width': tf.FixedLenFeature([1], tf.int64), + 'image/channels': tf.FixedLenFeature([1], tf.int64), + 'image/shape': tf.FixedLenFeature([3], tf.int64), + 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64), + 'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64), + 'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64), + } + items_to_handlers = { + 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), + 'shape': slim.tfexample_decoder.Tensor('image/shape'), + 'object/bbox': slim.tfexample_decoder.BoundingBox( + ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'), + 'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'), + 'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'), + 'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'), + } + decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) + + dataset = slim.dataset.Dataset( + data_sources=file_pattern, + reader=tf.TFRecordReader, + decoder=decoder, + num_samples=100, + items_to_descriptions=None, + num_classes=21, + labels_to_names=None) + + with tf.name_scope('dataset_data_provider'): + provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + num_readers=2, + common_queue_capacity=32, + common_queue_min=8, + shuffle=True, + num_epochs=1) + + [org_image, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(['image', 'shape', + 'object/label', + 'object/bbox', + 'object/difficult']) + image, glabels, gbboxes = ssd_preprocessing.preprocess_image(org_image, glabels_raw, gbboxes_raw, [300, 300], is_training=True, data_format='channels_last', output_rgb=True) + + anchor_creator = anchor_manipulator.AnchorCreator([300] * 2, + layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], + anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)], + extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)], + anchor_ratios = [(2., .5), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., 3., .5, 0.3333), (2., .5), (2., .5)], + layer_steps = [8, 16, 32, 64, 100, 300]) + + all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors() + + num_anchors_per_layer = [] + for ind in range(len(all_anchors)): + num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind]) + + anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders=[1.0] * 6, + positive_threshold = 0.5, + ignore_threshold = 0.5, + prior_scaling=[0.1, 0.1, 0.2, 0.2]) + + gt_targets, gt_labels, gt_scores = anchor_encoder_decoder.encode_all_anchors(glabels, gbboxes, all_anchors, all_num_anchors_depth, all_num_anchors_spatial, True) + + anchors = anchor_encoder_decoder._all_anchors + # split by layers + gt_targets, gt_labels, gt_scores, anchors = tf.split(gt_targets, num_anchors_per_layer, axis=0),\ + tf.split(gt_labels, num_anchors_per_layer, axis=0),\ + tf.split(gt_scores, num_anchors_per_layer, axis=0),\ + 
[tf.split(anchor, num_anchors_per_layer, axis=0) for anchor in anchors] + + save_image_op = tf.py_func(save_image_with_bbox, + [ssd_preprocessing.unwhiten_image(image), + tf.clip_by_value(tf.concat(gt_labels, axis=0), 0, tf.int64.max), + tf.concat(gt_scores, axis=0), + tf.concat(gt_targets, axis=0)], + tf.int64, stateful=True) + return save_image_op + +if __name__ == '__main__': + save_image_op = slim_get_split('/media/rs/7A0EE8880EE83EAF/Detections/SSD/dataset/tfrecords/train*') + # Create the graph, etc. + init_op = tf.group([tf.local_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()]) + + # Create a session for running operations in the Graph. + sess = tf.Session() + # Initialize the variables (like the epoch counter). + sess.run(init_op) + + # Start input enqueue threads. + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + try: + while not coord.should_stop(): + # Run training steps or whatever + print(sess.run(save_image_op)) + + except tf.errors.OutOfRangeError: + print('Done training -- epoch limit reached') + finally: + # When done, ask the threads to stop. + coord.request_stop() + + # Wait for threads to finish. + coord.join(threads) + sess.close() diff --git a/cv/detection/ssd/tensorflow/utility/checkpint_inspect.py b/cv/detection/ssd/tensorflow/utility/checkpint_inspect.py new file mode 100644 index 0000000000000000000000000000000000000000..2979e88cedc2564f326506f4412c8949e9dfb0a1 --- /dev/null +++ b/cv/detection/ssd/tensorflow/utility/checkpint_inspect.py @@ -0,0 +1,55 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python import pywrap_tensorflow + +def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors): + try: + reader = pywrap_tensorflow.NewCheckpointReader(file_name) + if all_tensors: + var_to_shape_map = reader.get_variable_to_shape_map() + for key in var_to_shape_map: + print("tensor_name: ", key) + print(reader.get_tensor(key)) + elif not tensor_name: + print(reader.debug_string().decode("utf-8")) + else: + print("tensor_name: ", tensor_name) + print(reader.get_tensor(tensor_name)) + except Exception as e: # pylint: disable=broad-except + print(str(e)) + if "corrupted compressed block contents" in str(e): + print("It's likely that your checkpoint file has been compressed " + "with SNAPPY.") + +def print_all_tensors_name(file_name): + try: + reader = pywrap_tensorflow.NewCheckpointReader(file_name) + var_to_shape_map = reader.get_variable_to_shape_map() + for key in var_to_shape_map: + print(key) + except Exception as e: # pylint: disable=broad-except + print(str(e)) + if "corrupted compressed block contents" in str(e): + print("It's likely that your checkpoint file has been compressed " + "with SNAPPY.") + +if __name__ == "__main__": + print_all_tensors_name('./model/vgg16_reducedfc.ckpt') diff --git a/cv/detection/ssd/tensorflow/utility/draw_toolbox.py b/cv/detection/ssd/tensorflow/utility/draw_toolbox.py new file mode 100644 index 0000000000000000000000000000000000000000..a72ae50ecdf48cb10af98c0795f0cb865f66daf0 --- /dev/null +++ b/cv/detection/ssd/tensorflow/utility/draw_toolbox.py @@ -0,0 +1,73 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +import cv2 +import matplotlib.cm as mpcm + +from dataset import dataset_common + +def gain_translate_table(): + label2name_table = {} + for class_name, labels_pair in dataset_common.VOC_LABELS.items(): + label2name_table[labels_pair[0]] = class_name + return label2name_table + +label2name_table = gain_translate_table() + +def colors_subselect(colors, num_classes=21): + dt = len(colors) // num_classes + sub_colors = [] + for i in range(num_classes): + color = colors[i*dt] + if isinstance(color[0], float): + sub_colors.append([int(c * 255) for c in color]) + else: + sub_colors.append([c for c in color]) + return sub_colors + +colors = colors_subselect(mpcm.plasma.colors, num_classes=21) +colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120), + (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150), + (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148), + (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199), + (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)] + +def bboxes_draw_on_img(img, classes, scores, bboxes, thickness=2): + shape = img.shape + scale = 0.4 + text_thickness = 1 + line_type = 8 + for i in range(bboxes.shape[0]): + if classes[i] < 1: continue + bbox = bboxes[i] + color = colors_tableau[classes[i]] + # Draw bounding boxes + p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1])) + p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1])) + if (p2[0] - p1[0] < 1) or (p2[1] - p1[1] < 1): + continue + + cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness) + # Draw text + s = '%s/%.1f%%' % (label2name_table[classes[i]], scores[i]*100) + # text_size is (width, height) + text_size, baseline = cv2.getTextSize(s, cv2.FONT_HERSHEY_SIMPLEX, scale, text_thickness) + p1 = (p1[0] - text_size[1], p1[1]) + + cv2.rectangle(img, (p1[1] - thickness//2, p1[0] - thickness - baseline), (p1[1] + text_size[0], p1[0] + text_size[1]), color, -1) + + cv2.putText(img, s, (p1[1], p1[0] + baseline), cv2.FONT_HERSHEY_SIMPLEX, scale, (255,255,255), text_thickness, line_type) + + return img + diff --git a/cv/detection/ssd/tensorflow/utility/scaffolds.py b/cv/detection/ssd/tensorflow/utility/scaffolds.py new file mode 100644 index 0000000000000000000000000000000000000000..f19b7fcd9cbad93b703fef374fac7a0fa83a7fbf --- /dev/null +++ b/cv/detection/ssd/tensorflow/utility/scaffolds.py @@ -0,0 +1,86 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow.compat.v1 as tf + +def get_init_fn_for_scaffold(model_dir, checkpoint_path, model_scope, checkpoint_model_scope, checkpoint_exclude_scopes, ignore_missing_vars, name_remap=None): + if tf.train.latest_checkpoint(model_dir): + tf.logging.info('Ignoring --checkpoint_path because a checkpoint already exists in %s.' % model_dir) + return None + exclusion_scopes = [] + if checkpoint_exclude_scopes: + exclusion_scopes = [scope.strip() for scope in checkpoint_exclude_scopes.split(',')] + + variables_to_restore = [] + for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): + excluded = False + for exclusion in exclusion_scopes: + if exclusion in var.op.name:#.startswith(exclusion): + excluded = True + break + if not excluded: + variables_to_restore.append(var) + if checkpoint_model_scope is not None: + if checkpoint_model_scope.strip() == '': + variables_to_restore = {var.op.name.replace(model_scope + '/', ''): var for var in variables_to_restore} + else: + variables_to_restore = {var.op.name.replace(model_scope, checkpoint_model_scope.strip()): var for var in variables_to_restore} + if name_remap is not None: + renamed_variables_to_restore = dict() + for var_name, var in variables_to_restore.items(): + found = False + for k, v in name_remap.items(): + if k in var_name: + renamed_variables_to_restore[var_name.replace(k, v)] = var + found = True + break + if not found: + renamed_variables_to_restore[var_name] = var + variables_to_restore = renamed_variables_to_restore + + checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) if tf.gfile.IsDirectory(checkpoint_path) else checkpoint_path + + tf.logging.info('Fine-tuning from %s. Ignoring missing vars: %s.' % (checkpoint_path, ignore_missing_vars)) + + if not variables_to_restore: + raise ValueError('variables_to_restore cannot be empty') + if ignore_missing_vars: + reader = tf.train.NewCheckpointReader(checkpoint_path) + if isinstance(variables_to_restore, dict): + var_dict = variables_to_restore + else: + var_dict = {var.op.name: var for var in variables_to_restore} + available_vars = {} + for var in var_dict: + if reader.has_tensor(var): + available_vars[var] = var_dict[var] + else: + tf.logging.warning('Variable %s missing in checkpoint %s.', var, checkpoint_path) + variables_to_restore = available_vars + if variables_to_restore: + saver = tf.train.Saver(variables_to_restore, reshape=False) + saver.build() + def callback(scaffold, session): + saver.restore(session, checkpoint_path) + return callback + else: + tf.logging.warning('No Variables to restore.') + return None diff --git a/cv/detection/ssd/tensorflow/voc_eval.py b/cv/detection/ssd/tensorflow/voc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..d15d848ddf623450bd053c3a1a49e731e6ca2bd5 --- /dev/null +++ b/cv/detection/ssd/tensorflow/voc_eval.py @@ -0,0 +1,268 @@ +# Copyright 2018 Changan Wang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import os +import numpy as np +import pickle + +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + +from dataset import dataset_common + +''' +VOC2007TEST + Annotations + ... + ImageSets +''' +dataset_path = '/media/rs/7A0EE8880EE83EAF/Detections/PASCAL/VOC/VOC2007TEST' +# change above path according to your system settings +pred_path = './logs/predict' +pred_file = 'results_{}.txt' # from 1-num_classes +output_path = './logs/predict/eval_output' +cache_path = './logs/predict/eval_cache' +anno_files = 'Annotations/{}.xml' +all_images_file = 'ImageSets/Main/test.txt' + +def parse_rec(filename): + """ Parse a PASCAL VOC xml file """ + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, + int(bbox.find('ymin').text) - 1, + int(bbox.find('xmax').text) - 1, + int(bbox.find('ymax').text) - 1] + objects.append(obj_struct) + + return objects + +def do_python_eval(use_07=True): + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = use_07 + print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) + if not os.path.isdir(output_path): + os.mkdir(output_path) + for cls_name, cls_pair in dataset_common.VOC_LABELS.items(): + if 'none' in cls_name: + continue + cls_id = cls_pair[0] + filename = os.path.join(pred_path, pred_file.format(cls_id)) + rec, prec, ap = voc_eval(filename, os.path.join(dataset_path, anno_files), + os.path.join(dataset_path, all_images_file), cls_name, cache_path, + ovthresh=0.5, use_07_metric=use_07_metric) + aps += [ap] + print('AP for {} = {:.4f}'.format(cls_name, ap)) + with open(os.path.join(output_path, cls_name + '_pr.pkl'), 'wb') as f: + pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) + print('Mean AP = {:.4f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('Results:') + for ap in aps: + print('{:.3f}'.format(ap)) + print('{:.3f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('') + print('--------------------------------------------------------------') + print('Results computed with the **unofficial** Python eval code.') + print('Results should be very close to the official MATLAB eval code.') + print('--------------------------------------------------------------') + + +def voc_ap(rec, prec, use_07_metric=True): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. 
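+        # Average the best precision attainable at the 11 recall thresholds 0.0, 0.1, ..., 1.0.
+        # Toy example: precision 1.0 up to recall 0.5 and 0.5 beyond it gives
+        # AP = (6 * 1.0 + 5 * 0.5) / 11, roughly 0.77.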
+ for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=True): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) + Top level function that does the PASCAL VOC evaluation. + detpath: Path to detections + detpath.format(classname) should produce the detection results file. + annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. + imagesetfile: Text file containing the list of images, one image per line. + classname: Category name (duh) + cachedir: Directory for caching the annotations + [ovthresh]: Overlap threshold (default = 0.5) + [use_07_metric]: Whether to use VOC07's 11 point AP computation + (default False) + """ + # assumes detections are in detpath.format(classname) + # assumes annotations are in annopath.format(imagename) + # assumes imagesetfile is a text file with each line an image name + # cachedir caches the annotations in a pickle file + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, 'annots.pkl') + # read list of images + with open(imagesetfile, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + print('Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames))) + # save + print('Saving cached annotations to {:s}'.format(cachefile)) + with open(cachefile, 'wb') as f: + pickle.dump(recs, f) + else: + # load + with open(cachefile, 'rb') as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + # read dets + with open(detpath, 'r') as f: + lines = f.readlines() + + if any(lines) == 1: + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + if 
BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin, 0.) + ih = np.maximum(iymax - iymin, 0.) + inters = iw * ih + uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + + (BBGT[:, 2] - BBGT[:, 0]) * + (BBGT[:, 3] - BBGT[:, 1]) - inters) + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + else: + rec = -1. + prec = -1. + ap = -1. + + return rec, prec, ap + +if __name__ == '__main__': + do_python_eval()
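+# Input format expected by voc_eval(): one detection file per class
+# (pred_file = 'results_<class_id>.txt' under pred_path), each line being
+# "<image_id> <confidence> <xmin> <ymin> <xmax> <ymax>", with box coordinates
+# in the same (pixel) frame as the XML annotations.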