『TensorFlow』SSD源码学习_其五：TFR数据读取&数据预处理

Fork版本项目地址：SSD

一、TFR数据读取

创建slim.dataset.Dataset对象

在train_ssd_network.py获取数据操作如下，首先需要slim.dataset.Dataset对象

# Select the dataset.

# The three arguments are, e.g., 'imagenet', 'train' and the directory that stores the TFRecord files.

# TFRecord files are expected to be named like 'voc_2012_%s_*.tfrecord', where %s is 'train' or 'test'.

dataset = dataset_factory.get_dataset(

    FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

获取过程会经过一系列臃肿的调用，我把中间被调用的函数（们）写在了下面，由上到下依次调用：

def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
    """Dispatch to the `get_split` function of the dataset registered as `name`.

    Args:
        name: Key into `datasets_map` identifying the dataset module.
        split_name: Split to load; forwarded unchanged.
        dataset_dir: Directory holding the dataset files; forwarded unchanged.
        file_pattern: Optional file-name pattern; forwarded unchanged.
        reader: Optional reader; forwarded unchanged.

    Returns:
        A `Dataset` class.

    Raises:
        ValueError: If the dataset `name` is unknown.
    """
    if name in datasets_map:
        # The article notes this resolves to e.g. pascalvoc_2012.get_split.
        split_fn = datasets_map[name].get_split
        return split_fn(split_name, dataset_dir, file_pattern, reader)
    raise ValueError('Name of dataset unknown %s' % name)

def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
    """Return the `Dataset` for one split, using this module's constants.

    Thin wrapper: fills in the module-level `FILE_PATTERN`, `SPLITS_TO_SIZES`,
    `ITEMS_TO_DESCRIPTIONS` and `NUM_CLASSES` and delegates to
    `pascalvoc_common.get_split`.

    Args:
        split_name: Name of the train/test split.
        dataset_dir: Directory that holds the TFRecord files.
        file_pattern: Optional file-name pattern; `FILE_PATTERN` is used when
            this is empty or None.
        reader: Optional reader, forwarded unchanged.

    Returns:
        A `Dataset` namedtuple.

    Raises:
        ValueError: if `split_name` is not a valid train/test split.
    """
    # Values quoted by the article for these module constants (not visible here):
    #   FILE_PATTERN          = 'voc_2012_%s_*.tfrecord'
    #   SPLITS_TO_SIZES       = {'train': 17125,}
    #   NUM_CLASSES           = 20
    #   ITEMS_TO_DESCRIPTIONS = {
    #       'image': 'A color image of varying height and width.',
    #       'shape': 'Shape of the image',
    #       'object/bbox': 'A list of bounding boxes, one per each object.',
    #       'object/label': 'A list of labels, one per each object.',
    #   }
    pattern = file_pattern or FILE_PATTERN
    return pascalvoc_common.get_split(
        split_name, dataset_dir, pattern, reader,
        SPLITS_TO_SIZES, ITEMS_TO_DESCRIPTIONS, NUM_CLASSES)

最终调用，获取slim.dataset.Dataset（解析见『TensorFlow』从磁盘读取数据），实际上能够传入满足slim.dataset.Dataset的参数即可：

def get_split(split_name, dataset_dir, file_pattern, reader,

              split_to_sizes, items_to_descriptions, num_classes):

    """Build a `slim.dataset.Dataset` describing how to read Pascal VOC TFRecords.

    Args:
      split_name: A train/test split name; must be a key of `split_to_sizes`.
      dataset_dir: The base directory of the dataset sources.
      file_pattern: The file pattern to use when matching the dataset sources.
        It is assumed that the pattern contains a '%s' string so that the split
        name can be inserted.
      reader: The TensorFlow reader type; `tf.TFRecordReader` is used when None.
      split_to_sizes: Mapping from split name to its number of samples.
      items_to_descriptions: Mapping from item name to a description string;
        passed through to the `Dataset` unchanged.
      num_classes: Number of classes; passed through to the `Dataset` unchanged.

    Returns:
      A `Dataset` namedtuple.

    Raises:
        ValueError: if `split_name` is not a valid train/test split.
    """

    # e.g. 'train'

    if split_name not in split_to_sizes:

        raise ValueError('split name %s was not recognized.' % split_name)

    file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

    # Allowing None in the signature so that dataset_factory can use the default.

    if reader is None:

        reader = tf.TFRecordReader

    # Features in Pascal VOC TFRecords.

    keys_to_features = {  # how each raw TFRecord field is parsed

        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),

        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),

        'image/height': tf.FixedLenFeature([1], tf.int64),

        'image/width': tf.FixedLenFeature([1], tf.int64),

        'image/channels': tf.FixedLenFeature([1], tf.int64),

        'image/shape': tf.FixedLenFeature([3], tf.int64),

        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),

        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),

        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),

        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),

        'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),

        'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),

        'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),

    }

    items_to_handlers = {  # how parsed fields are assembled into named items

        'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),

        'shape': slim.tfexample_decoder.Tensor('image/shape'),

        # The four per-coordinate fields are combined into one 'object/bbox' item.
        'object/bbox': slim.tfexample_decoder.BoundingBox(

                ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),

        'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),

        'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),

        'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),

    }

    # Build the decoder from the two specifications above.

    decoder = slim.tfexample_decoder.TFExampleDecoder(

        keys_to_features, items_to_handlers)

    labels_to_names = None

    # NOTE(review): has_labels presumably checks for 'labels.txt' inside
    # dataset_dir (tf.gfile.Exists(os.path.join(dataset_dir, 'labels.txt'))) - confirm.

    if dataset_utils.has_labels(dataset_dir):

        labels_to_names = dataset_utils.read_label_file(dataset_dir)

    # else:

    #     labels_to_names = create_readable_names_for_imagenet_labels()

    #     dataset_utils.write_label_file(labels_to_names, dataset_dir)

    return slim.dataset.Dataset(

            data_sources=file_pattern,                    # TFRecord file pattern

            reader=reader,                                # reader class

            decoder=decoder,                              # decoder built above

            num_samples=split_to_sizes[split_name],       # number of samples in this split

            items_to_descriptions=items_to_descriptions,  # descriptions of the decoder items

            num_classes=num_classes,                      # number of classes

            labels_to_names=labels_to_names               # result of read_label_file, or None

    )

''' items_to_descriptions:

    {'image': 'A color image of varying height and width.',

     'shape': 'Shape of the image',

     'object/bbox': 'A list of bounding boxes, one per each object.',

     'object/label': 'A list of labels, one per each object.',}

'''

这里额外说一句，存储数据中ymin、xmin、ymax、xmax各自存储为(n,)的shape（n表示图像中对象数目），但是经过items_to_handlers解码之后，新条目object/bbox的形状变为(n, 4)。由于后续多目标检测的一系列处理都依赖这个形状，所以值得注意。

从TFR中获取 batch数据

            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):

                provider = slim.dataset_data_provider.DatasetDataProvider(

                    dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as its argument

                    num_readers=FLAGS.num_readers,

                    common_queue_capacity=20 * FLAGS.batch_size,

                    common_queue_min=10 * FLAGS.batch_size,

                    shuffle=True)

            # Get for SSD network: image, labels, bboxes.

            # Items are requested from the provider by name.
            # NOTE(review): the original note says this yields batch-size data -
            # confirm whether provider.get returns a single sample or a batch.

            [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',

                                                             'object/label',

                                                             'object/bbox'])

此时数据已经获取完毕，预处理之后即可加入运算。

注意，直到现在为止，我们仅对图片数据进行了解码，并没有扩充维度，也就是说其维度依然是3维。

二、数据处理

获取对应数据集的预处理函数，并使用其处理上面小节中获取的数据（此时图片仍是3维，尚未组成batch），

# Look up the preprocessing function by name; is_training=True is forwarded to it.
image_preprocessing_fn = preprocessing_factory.get_preprocessing(

            preprocessing_name, is_training=True)

# Pre-processing image, labels and bboxes.

image, glabels, gbboxes = \

    image_preprocessing_fn(image, glabels, gbboxes,

                           out_shape=ssd_shape,  # e.g. (300, 300)

                           data_format=DATA_FORMAT)  # e.g. 'NCHW'

有的时候你会觉得这种层层调用非常的sb……下面两步依旧是个调用链，

def get_preprocessing(name, is_training=False):
    """Return a preprocessing function for the network called `name`.

    Args:
        name: Network name; 'ssd_300_vgg' and 'ssd_512_vgg' are recognized.
        is_training: Forwarded to `preprocess_image` on every call of the
            returned function.

    Returns:
        A function `preprocessing_fn(image, labels, bboxes, out_shape,
        data_format='NHWC', **kwargs)` that calls the selected module's
        `preprocess_image`.

    Raises:
        ValueError: If `name` is not a recognized preprocessing name.
    """
    # Both supported networks share the same preprocessing module.
    preprocessing_fn_map = dict.fromkeys(
        ('ssd_300_vgg', 'ssd_512_vgg'), ssd_vgg_preprocessing)
    if name not in preprocessing_fn_map:
        raise ValueError('Preprocessing name [%s] was not recognized' % name)
    selected = preprocessing_fn_map[name]

    def preprocessing_fn(image, labels, bboxes,
                         out_shape, data_format='NHWC', **kwargs):
        return selected.preprocess_image(
            image, labels, bboxes, out_shape,
            data_format=data_format, is_training=is_training, **kwargs)

    return preprocessing_fn

def preprocess_image(image, labels, bboxes, out_shape, data_format,
                     is_training=False, **kwargs):
    """Route to the training or the evaluation preprocessing pipeline.

    Args:
        image: Image to preprocess; forwarded unchanged.
        labels: Labels; forwarded unchanged.
        bboxes: Bounding boxes; forwarded unchanged.
        out_shape: Target output shape; forwarded as `out_shape`.
        data_format: Data layout; forwarded as `data_format`.
        is_training: Selects `preprocess_for_train` when true, otherwise
            `preprocess_for_eval`.
        **kwargs: Extra options, forwarded to `preprocess_for_eval` only.

    Returns:
        Whatever the selected pipeline returns.
    """
    if not is_training:
        return preprocess_for_eval(image, labels, bboxes,
                                   out_shape=out_shape,
                                   data_format=data_format,
                                   **kwargs)
    # The training path does not receive **kwargs.
    return preprocess_for_train(image, labels, bboxes,
                                out_shape=out_shape,
                                data_format=data_format)

之后就是数据具体的预处理函数，本篇我们仅仅关注训练预处理。

训练数据预处理概览

大致流程是：

有条件的在原图上裁剪一个区域

计算裁剪后区域和各个标注框的重叠，视阈值保留bboxes和labels

裁剪出来的图片缩放到网络输入大小（bbox都是归一化的，不需要放缩）

随机翻转（bbox要同步翻转）

其他预处理（不涉及bbox）

返回image, labels, bboxes

def preprocess_for_train(image, labels, bboxes,

                         out_shape, data_format='NHWC',

                         scope='ssd_preprocessing_train'):

    """Preprocesses the given image for training.

    Steps, in order: convert to float32, random constrained crop (which also
    filters labels/bboxes), resize to `out_shape`, random horizontal flip,
    random color distortion, rescale by 255 and whiten with the channel means,
    and finally an optional transpose to channels-first.

    Args:
        image: Image tensor; must have exactly 3 dimensions.
        labels: Labels of the objects in the image.
        bboxes: Bounding boxes of the objects in the image.
        out_shape: Target size passed to `tf_image.resize_image`.
        data_format: When equal to 'NCHW' the image is transposed with
            perm=(2, 0, 1); any other value leaves the layout untouched.
        scope: Name scope for the created ops.

    Returns:
        The tuple (image, labels, bboxes) after preprocessing.

    Raises:
        ValueError: If `image` does not have 3 dimensions.
    """

    fast_mode = False

    with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):

        if image.get_shape().ndims != 3:

            raise ValueError('Input must be of size [height, width, C>0]')

        # Convert to float scaled [0, 1].

        if image.dtype != tf.float32:

            image = tf.image.convert_image_dtype(image, dtype=tf.float32)

        tf_summary_image(image, bboxes, 'image_with_bboxes')

        # From here on the image is a 3-D tf.float32 tensor.

        # Constrained random crop. Returns the cropped image, the filtered
        # labels (n,) and bboxes (n, 4), and the crop box (4,) in original-image
        # coordinates. distort_bbox is not used again in this function.

        dst_image, labels, bboxes, distort_bbox = \

            distorted_bounding_box_crop(image, labels, bboxes,

                                        min_object_covered=MIN_OBJECT_COVERED,  # 0.25

                                        aspect_ratio_range=CROP_RATIO_RANGE)  # (0.6, 1.67)

        # Resize image to output size.

        dst_image = tf_image.resize_image(dst_image, out_shape,

                                          method=tf.image.ResizeMethod.BILINEAR,

                                          align_corners=False)

        tf_summary_image(dst_image, bboxes, 'image_shape_distorted')

        # Randomly flip the image horizontally.

        dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)

        # Randomly distort the colors. There are 4 ways to do it.

        dst_image = apply_with_random_selector(

                dst_image,

                lambda x, ordering: distort_color(x, ordering, fast_mode),

                num_cases=4)

        tf_summary_image(dst_image, bboxes, 'image_color_distorted')

        # Rescale to VGG input scale.

        image = dst_image * 255.

        image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])

        # NOTE(review): tf_image_whitened presumably subtracts the per-channel
        # means (image - tf.constant(means, dtype=image.dtype)) - confirm.

        # Image data format.

        if data_format == 'NCHW':

            image = tf.transpose(image, perm=(2, 0, 1))

        # Returned: image (channels-first only when data_format == 'NCHW'),
        # labels (n,), bboxes (n, 4).

        return image, labels, bboxes

裁剪图片并调整labels、bboxes

整体流程如下，

调用内置函数保证裁剪的大小范围以及一定会包含一些关注目标，返回裁剪参数

裁剪（注意保留裁剪位置参数）图片

计算裁剪框和各个检测框的重叠，并设置阈值舍弃、调整保留框坐标

def distorted_bounding_box_crop(image,

                                labels,

                                bboxes,

                                min_object_covered=0.3,

                                aspect_ratio_range=(0.9, 1.1),

                                area_range=(0.1, 1.0),

                                max_attempts=200,

                                clip_bboxes=True,

                                scope=None):

    """Randomly crop the image and update labels/bboxes to match the crop.

    See `tf.image.sample_distorted_bounding_box` for more documentation.

    Args:
        image: 3-D image Tensor; the crop is given static shape
            [None, None, 3].
        labels: Labels of the objects, one per bounding box.
        bboxes: Bounding boxes of the objects; expanded to [1, n, 4] before
            being handed to the sampler. Coordinates are presumably arranged
            as [ymin, xmin, ymax, xmax] and normalized to the image - confirm.
        min_object_covered: Forwarded to the sampler. The cropped area of the
            image must contain at least this fraction of any bounding box
            supplied.
        aspect_ratio_range: Forwarded to the sampler. The cropped area of the
            image must have an aspect ratio = width / height within this range.
        area_range: Forwarded to the sampler. The cropped area of the image
            must contain a fraction of the supplied image within in this range.
        max_attempts: Forwarded to the sampler. Number of attempts at
            generating a cropped region of the image of the specified
            constraints. After `max_attempts` failures, return the entire image.
        clip_bboxes: Not used by this implementation.
        scope: Optional scope for name_scope.

    Returns:
        A tuple (cropped_image, labels, bboxes, distort_bbox): the cropped 3-D
        image, the labels and bboxes that survive the overlap filter (bboxes
        re-expressed relative to the crop), and the crop box itself as a [4]
        Tensor.
    """

    with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):

        # Constrained random crop.

        # The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width

        # and height of the underlying image.

        # Output shapes: 1-D, 1-D, [1, 1, 4]

        bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(

                tf.shape(image),

                bounding_boxes=tf.expand_dims(bboxes, 0),  # [1, n, 4]

                min_object_covered=min_object_covered,

                aspect_ratio_range=aspect_ratio_range,

                area_range=area_range,

                max_attempts=max_attempts,

                use_image_if_no_bounding_boxes=True)

        '''

        Returns:

            A tuple of `Tensor` objects (begin, size, bboxes).

        begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`.

            Provide as input to `tf.slice`.

        size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`.

            Provide as input to `tf.slice`.

        bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.

            Provide as input to `tf.image.draw_bounding_boxes`.

        '''

        # Drop the two leading singleton dimensions: [1, 1, 4] -> [4]

        distort_bbox = distort_bbox[0, 0]

        # Crop the image to the specified bounding box.

        cropped_image = tf.slice(image, bbox_begin, bbox_size)

        # Restore the shape since the dynamic slice loses 3rd dimension.

        cropped_image.set_shape([None, None, 3])  # the static channel dimension is fixed to 3 here

        # Update bounding boxes: resize and filter out.

        bboxes = tfe.bboxes_resize(distort_bbox, bboxes)  # [4], [n, 4]

        labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,

                                                   threshold=BBOX_CROP_OVERLAP,  # 0.5

                                                   assign_negative=False)

        # Return the random crop, the filtered labels (n,) and bboxes (n, 4),
        # and the crop box (4,) in original-image coordinates.

        return cropped_image, labels, bboxes, distort_bbox

三个关键函数：

tf.image.sample_distorted_bounding_box 裁剪，用法查看文档，就是裁剪一个子图，返回最后参数是子图坐标

bboxes_resize 框坐标原点置为裁剪框左上角点，xy单位长度置为裁剪框wh（归一化）

bboxes_filter_overlap 计算重叠区/原框的百分比，舍弃达不到阈值的labels和bboxes

其中第二个函数我们前面并未强调，但是，由于所有的涉及框坐标的计算都是基于图像坐标归一化之后（tf内置函数都是这样），所以这一步计算是必要的，将坐标系由原图（注意是图，这也导致了两者单位长度差别很大）转换为裁剪框，并设定单位长度。

def bboxes_resize(bbox_ref, bboxes, name=None):
    """Re-express bounding boxes in the coordinate frame of a reference box.

    Each box is shifted so that the (ymin, xmin) corner of `bbox_ref` becomes
    the origin, then divided by the height/width of `bbox_ref`.

    Args:
        bbox_ref: Reference box, indexed as [ymin, xmin, ymax, xmax].
        bboxes: Boxes to convert, with the same coordinate order in the last
            dimension (assumed (n, 4) - TODO confirm against callers).
        name: Optional name for the name scope.

    Returns:
        The converted boxes.
    """
    with tf.name_scope(name, 'bboxes_resize'):
        ref_ymin, ref_xmin = bbox_ref[0], bbox_ref[1]
        ref_height = bbox_ref[2] - bbox_ref[0]
        ref_width = bbox_ref[3] - bbox_ref[1]
        # Translate to the reference origin, then scale by the reference size.
        origin = tf.stack([ref_ymin, ref_xmin, ref_ymin, ref_xmin])
        extent = tf.stack([ref_height, ref_width, ref_height, ref_width])
        return (bboxes - origin) / extent

def bboxes_filter_overlap(labels, bboxes,
                          threshold=0.5, assign_negative=False,
                          scope=None):
    """Filter boxes by their relative overlap with the unit box [0, 0, 1, 1].

    The score of a box comes from `bboxes_intersection` against the unit box.
    Boxes whose score is not above `threshold` are either removed or get
    their label negated.

    Args:
        labels: Labels, one per box.
        bboxes: Boxes to score; also supplies the dtype of the unit box.
        threshold: A box is kept as-is only when its score is strictly
            greater than this value.
        assign_negative: When true, keep every box and negate the labels of
            those below the threshold; when false, drop those boxes and labels.
        scope: Optional name scope.

    Return:
      labels, bboxes: Filtered (or newly assigned) elements.
    """
    with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
        unit_box = tf.constant([0, 0, 1, 1], bboxes.dtype)
        keep = bboxes_intersection(unit_box, bboxes) > threshold
        if assign_negative:
            # Keep everything; labels of insufficiently covered boxes turn negative.
            return tf.where(keep, labels, -labels), bboxes
        # Drop labels and boxes that are insufficiently covered.
        kept_labels = tf.boolean_mask(labels, keep)
        kept_bboxes = tf.boolean_mask(bboxes, keep)
        return kept_labels, kept_bboxes

# 被上面函数调用，计算相交（和裁剪框）面积占原框面积比值

def bboxes_intersection(bbox_ref, bboxes, name=None):
    """Fraction of each box's area that lies inside a reference box.

    For every box the intersection area with `bbox_ref` is divided by the
    area of the box itself.

    Args:
      bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
      bboxes: (N, 4) Tensor, collection of bounding boxes.
      name: Optional name for the name scope.

    Return:
      (N,) Tensor with relative intersection.
    """
    with tf.name_scope(name, 'bboxes_intersection'):
        # Transpose first so that each coordinate is one row.
        boxes_t = tf.transpose(bboxes)
        ref_t = tf.transpose(bbox_ref)
        # Corners of the intersection rectangle.
        top = tf.maximum(boxes_t[0], ref_t[0])
        left = tf.maximum(boxes_t[1], ref_t[1])
        bottom = tf.minimum(boxes_t[2], ref_t[2])
        right = tf.minimum(boxes_t[3], ref_t[3])
        # Negative extents mean no overlap and are clamped to zero.
        overlap_h = tf.maximum(bottom - top, 0.)
        overlap_w = tf.maximum(right - left, 0.)
        overlap_area = overlap_h * overlap_w
        box_area = (boxes_t[2] - boxes_t[0]) * (boxes_t[3] - boxes_t[1])
        # NOTE(review): safe_divide presumably yields 0 where box_area is not
        # positive instead of dividing - confirm in tfe_math.
        return tfe_math.safe_divide(overlap_area, box_area, 'intersection')

其他预处理函数没什么特别注意的，不多介绍，自行查看源码即可。

至此，数据预处理完成，我们给出自从TFR中获取数据到预处理完成的局部代码，如下，

        # Place the input pipeline on the device chosen by the deployment config.
        with tf.device(deploy_config.inputs_device()):

            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):

                provider = slim.dataset_data_provider.DatasetDataProvider(

                    dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as its argument

                    num_readers=FLAGS.num_readers,

                    common_queue_capacity=20 * FLAGS.batch_size,

                    common_queue_min=10 * FLAGS.batch_size,

                    shuffle=True)

            # Get for SSD network: image, labels, bboxes.

            # Items are requested from the provider by name.
            # NOTE(review): the original note says this yields batch-size data -
            # confirm whether provider.get returns a single sample or a batch.

            [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',

                                                             'object/label',

                                                             'object/bbox'])

            # Pre-processing image, labels and bboxes.

            # Expected results per the article: image in 'CHW' layout, labels (n,), bboxes (n, 4).

            image, glabels, gbboxes = \

                image_preprocessing_fn(image, glabels, gbboxes,

                                       out_shape=ssd_shape,  # e.g. (300, 300)

                                       data_format=DATA_FORMAT)  # e.g. 'NCHW'