1

Hello, I'm trying to convert yolov3-spp.weights to a .tf model. I found out how to convert yolov3 and yolov3_tiny, but I couldn't convert yolov3_spp. Any solutions?

1 Answer 1

2

Modify the route-layer handling in your Darknet53 model like this:

elif (block["type"] == "route"):
            block["layers"] = block["layers"].split(',')
            start = int(block["layers"][0])

            if len(block["layers"]) > 1 and int(block["layers"][1]) > 0:
                end = int(block["layers"][1]) - i
                filters = output_filters[i + start] + output_filters[end]  # Index negatif :end - index
                inputs = tf.concat([outputs[i + start], outputs[i + end]], axis=-1)
            elif len(block["layers"]) > 1 and int(block["layers"][1]) < 0: # for SPP (Spatial Pyramid Pooling) models layers=-1,-3,-5,-6
                filters = sum( output_filters[i + int(offset)] for offset in block["layers"] )
                inputs = tf.concat( [outputs[i + int(offset)] for offset in block["layers"]], axis=-1 )
            else:
                filters = output_filters[i + start]
                inputs = outputs[i + start]

The key part is here:

            elif len(block["layers"]) > 1 and int(block["layers"][1]) < 0: # for SPP (Spatial Pyramid Pooling) models layers=-1,-3,-5,-6
                filters = sum( output_filters[i + int(offset)] for offset in block["layers"] )
                inputs = tf.concat( [outputs[i + int(offset)] for offset in block["layers"]], axis=-1 )

The filter count is the sum of the output_filters of the referenced layers, and the input is the tensor obtained by concatenating the outputs of those layers along the channel axis.

Find the full code here:

'''
https://machinelearningspace.com/yolov3-tensorflow-2-part-2/
https://stackoverflow.com/questions/45175469/typeerror-concat-got-multiple-values-for-argument-axis
'''
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import BatchNormalization, Conv2D, \
    Input, ZeroPadding2D, LeakyReLU, UpSampling2D

def parse_cfg(cfgfile):
    """
    Parse the configuration file for YOLOv3

    Every "[name]" header starts a new block whose "type" entry is the
    header name; every following "key=value" line is stored in that block
    as strings. Blank or whitespace-only lines and lines whose first
    non-blank character is '#' are ignored. Only the first '=' separates
    key from value, so values may themselves contain '='.

    Args:
        cfgfile : str : config file
    Returns:
        blocks : list -> [dict,...,dict] : all config attributes
    Raises:
        ValueError : a non-header, non-comment line has no '='
    """
    blocks = []
    holder = {}
    with open(cfgfile, 'r') as file:
        for lineno, raw_line in enumerate(file, start=1):
            # Strip first so indentation, trailing blanks and stray '\r'
            # cannot break the header / comment detection below.
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            if line.startswith('['):
                # A header closes the block collected so far.
                if holder:
                    blocks.append(holder)
                    holder = {}
                holder['type'] = line[1:].split(']', 1)[0].strip()
                continue

            key, sep, value = line.partition('=')
            if not sep:
                raise ValueError(
                    "{}:{}: expected 'key=value', got {!r}".format(cfgfile, lineno, line))
            holder[key.rstrip()] = value.lstrip()

    # The last block has no following header to flush it.
    blocks.append(holder)
    return blocks

def YOLOv3Net(cfgfile, model_size, num_classes):
    """
    Build the Keras model described by a Darknet YOLOv3 cfg file.

    The blocks after the first one are walked in order and translated to
    Keras/TensorFlow ops; the output of every block is remembered so that
    route and shortcut blocks can refer back to earlier ones.

    Args:
        cfgfile : str : config file
        model_size : tuple : 3x1 tuple like (height, width, layers)
                            e.g. (608, 608, 3)
        num_classes : int : number of classes
    Returns:
        model : tf.keras.Model : maps the input image to the predictions of
                all [yolo] blocks concatenated along axis 1; the last axis
                holds (x, y, w, h, confidence, class scores...)
    """

    blocks = parse_cfg(cfgfile)

    outputs = {}
    output_filters = []
    filters = []
    out_pred = []
    scale = 0

    # We define the input model using Keras function and divided by 255 to normalize it to the range of 0–1.
    inputs = input_image = Input(shape=model_size)
    inputs = inputs / 255.0

    # For every iteration, we check the type of the block which corresponds to the type of layer.
    for i, block in enumerate(blocks[1:]):
        # If it is a convolutional layer
        if block["type"] == "convolutional":
            activation = block["activation"].strip()
            filters = int(block["filters"])
            kernel_size = int(block["size"])
            strides = int(block["stride"])
            # Presence check (not value check) on purpose: load_weights
            # decides between BN weights and a conv bias the same way.
            has_bn = "batch_normalize" in block

            if strides > 1:
                # Pad top/left only, then convolve with 'valid' padding.
                inputs = ZeroPadding2D(((1, 0), (1, 0)))(inputs)
            inputs = Conv2D(filters,
                            kernel_size,
                            strides=strides,
                            padding='valid' if strides > 1 else 'same',
                            name='conv_' + str(i),
                            use_bias=not has_bn)(inputs)
            if has_bn:
                inputs = BatchNormalization(name='bnorm_' + str(i))(inputs)
            # The activation is its own cfg key, so it must not depend on
            # whether the block is batch-normalised.
            if activation == "leaky":
                inputs = LeakyReLU(alpha=0.1, name='leaky_' + str(i))(inputs)

        elif block["type"] == "upsample":
            stride = int(block["stride"])
            inputs = UpSampling2D(stride)(inputs)

        # Max pooling. The SPP route (layers=-1,-3,-5,-6) concatenates the
        # outputs of [maxpool] blocks; without this branch those blocks
        # would pass their input through unchanged. 'same' padding keeps the
        # spatial size for stride 1 so the later concat is valid. The
        # channel count is untouched, so `filters` carries over.
        elif block["type"] == "maxpool":
            pool_stride = int(block.get("stride", 1))
            pool_size = int(block.get("size", pool_stride))
            inputs = tf.keras.layers.MaxPool2D(pool_size=pool_size,
                                               strides=pool_stride,
                                               padding='same')(inputs)

        # If it is a route layer
        elif block["type"] == "route":
            # Negative entries are relative to the current block, the others
            # are absolute block indices (e.g. "-4", "-1, 61" or
            # "-1,-3,-5,-6" for SPP).
            refs = [int(ref) for ref in block["layers"].split(',')]
            sources = [i + ref if ref < 0 else ref for ref in refs]

            filters = sum(output_filters[src] for src in sources)
            if len(sources) == 1:
                inputs = outputs[sources[0]]
            else:
                inputs = tf.concat([outputs[src] for src in sources], axis=-1)

        elif block["type"] == "shortcut":
            from_ = int(block["from"])
            inputs = outputs[i - 1] + outputs[i + from_]

        # Yolo detection layer
        elif block["type"] == "yolo":
            mask = [int(m) for m in block["mask"].split(",")]
            flat_anchors = [int(a) for a in block["anchors"].split(",")]
            # Pair up (width, height) values, then keep the anchors selected
            # by this block's mask.
            anchor_pairs = [(flat_anchors[k], flat_anchors[k + 1])
                            for k in range(0, len(flat_anchors), 2)]
            anchors = [anchor_pairs[m] for m in mask]
            n_anchors = len(anchors)

            # reshape the YOLOv3 output to the form of [None, B * grid size * grid size, 5 + C].
            # The B is the number of anchors and C is the number of classes.
            out_shape = inputs.get_shape().as_list()
            grid_h, grid_w = out_shape[1], out_shape[2]

            inputs = tf.reshape(inputs, [-1, n_anchors * grid_h * grid_w,
                                         5 + num_classes])

            # Then access all boxes attributes by this way:
            box_centers = inputs[:, :, 0:2]
            box_shapes = inputs[:, :, 2:4]
            confidence = inputs[:, :, 4:5]
            classes = inputs[:, :, 5:num_classes + 5]

            # Refine Bounding Boxes
            # Use the sigmoid function to convert box_centers, confidence,
            # and classes values into range of 0 – 1.
            box_centers = tf.sigmoid(box_centers)
            confidence = tf.sigmoid(confidence)
            classes = tf.sigmoid(classes)

            # Then convert box_shapes as the following:
            anchors = tf.tile(anchors, [grid_h * grid_w, 1])
            box_shapes = tf.exp(box_shapes) * tf.cast(anchors, dtype=tf.float32)

            # Use a meshgrid to convert the relative positions of the center boxes into
            # the real positions. x runs over columns (width) and y over rows
            # (height) so the flattened grid matches the row-major reshape
            # above; for square inputs this equals the previous behaviour.
            x = tf.range(grid_w, dtype=tf.float32)
            y = tf.range(grid_h, dtype=tf.float32)

            cx, cy = tf.meshgrid(x, y)
            cx = tf.reshape(cx, (-1, 1))
            cy = tf.reshape(cy, (-1, 1))
            cxy = tf.concat([cx, cy], axis=-1)
            cxy = tf.tile(cxy, [1, n_anchors])
            cxy = tf.reshape(cxy, [1, -1, 2])

            # (x stride, y stride): pixels per grid cell along width / height.
            strides = (input_image.shape[2] // grid_w,
                       input_image.shape[1] // grid_h)
            box_centers = (box_centers + cxy) * strides

            # Then, concatenate them all together.
            prediction = tf.concat([box_centers, box_shapes, confidence, classes], axis=-1)

            # YOLOv3 does 3 predictions across the scale. We do as it is.
            # Take the prediction result for each scale and concatenate it with the others.
            if scale:
                out_pred = tf.concat([out_pred, prediction], axis=1)
            else:
                out_pred = prediction
                scale = 1

        # Since the route and shortcut layers need output feature maps from previous layers,
        # so for every iteration, we always keep the track of the feature maps and output filters.
        # NOTE: block types not handled above leave `inputs` unchanged.
        outputs[i] = inputs
        output_filters.append(filters)

    # Finally, we can return our model.
    model = Model(input_image, out_pred)
    model.summary()
    return model

And the converter here:

#convert_weights.py
import numpy as np
from yolov3 import YOLOv3Net
from yolov3 import parse_cfg
import argparse

def _read_float32(fp, count, what):
    """Read exactly `count` float32 values from `fp`, or raise ValueError."""
    values = np.fromfile(fp, dtype=np.float32, count=count)
    # np.fromfile returns a shorter array at end-of-file instead of failing.
    if values.size != count:
        raise ValueError(
            "weight file ended early while reading {}: expected {} values, got {}"
            .format(what, count, values.size))
    return values


def load_weights(model,cfgfile,weightfile):
    """
    Copy the weights of a Darknet weight file into the Keras model.

    The cfg blocks are walked in the same order as when the model was
    built; for every convolutional block the matching 'conv_<i>' layer (and
    'bnorm_<i>' layer when the block has a batch_normalize key) is filled
    from the file.

    Args:
        model : Keras model whose layers are named 'conv_<i>' / 'bnorm_<i>'
        cfgfile : str : config file the model was built from
        weightfile : str : Darknet weight file
    Raises:
        ValueError : the weight file holds fewer values than the cfg needs
    """

    # Open the weights file
    with open(weightfile, 'rb') as fp:
        print("\t--->weightfile open")

        # The first 5 values are header information
        np.fromfile(fp, dtype=np.int32, count=5)

        blocks = parse_cfg(cfgfile)

        for i, block in enumerate(blocks[1:]):

            # Only convolutional blocks carry weights in the file.
            if block["type"] != "convolutional":
                continue

            conv_layer = model.get_layer('conv_' + str(i))
            print("layer: ",i+1,conv_layer)

            filters = conv_layer.filters
            k_size = conv_layer.kernel_size[0]
            in_dim = conv_layer.input_shape[-1]
            has_bn = "batch_normalize" in block

            # The per-filter values come first in the file: either the four
            # batch-norm vectors or the convolution bias.
            if has_bn:
                norm_layer = model.get_layer('bnorm_' + str(i))
                print("layer: ",i+1,norm_layer)

                bn_weights = _read_float32(fp, 4 * filters, 'bnorm_' + str(i))
                # Swap the first two rows to get the tf order
                # [gamma, beta, mean, variance]
                bn_weights = bn_weights.reshape((4, filters))[[1, 0, 2, 3]]
            else:
                conv_bias = _read_float32(fp, filters, 'conv_' + str(i) + ' bias')

            # darknet shape (out_dim, in_dim, height, width)
            conv_shape = (filters, in_dim, k_size, k_size)
            conv_weights = _read_float32(
                fp, int(np.prod(conv_shape)), 'conv_' + str(i) + ' kernel')
            # tf shape (height, width, in_dim, out_dim)
            conv_weights = conv_weights.reshape(
                conv_shape).transpose([2, 3, 1, 0])

            if has_bn:
                # One array per BatchNormalization weight.
                norm_layer.set_weights(list(bn_weights))
                conv_layer.set_weights([conv_weights])
            else:
                conv_layer.set_weights([conv_weights, conv_bias])

        # Leftover bytes are deliberately not treated as an error here.
        # assert len(fp.read()) == 0, 'failed to read all data'

def parser():
    """
    Read the command-line options of the converter.

    Both options are mandatory.

    Returns:
        dict : {'weightfile': str, 'cfgfile': str}
    """
    options = (
        ("-w", "--weightfile", "Path to weight file"),
        ("-c", "--cfgfile", "Path to cfg file"),
    )
    arg_parser = argparse.ArgumentParser()
    for short_flag, long_flag, description in options:
        arg_parser.add_argument(short_flag, long_flag, required=True,
                                help=description)
    namespace = arg_parser.parse_args()
    return vars(namespace)

def main():
    """
    Convert a Darknet weight file into TensorFlow weights.

    Builds the network from the cfg file given on the command line, loads
    the Darknet weights into it and saves them to
    'weights/yolov3_weights.tf'.
    """
    cli = parser()
    weights_path = cli['weightfile']
    cfg_path = cli['cfgfile']

    # Input size and class count are fixed for this converter.
    input_shape = (608, 608, 3)
    class_count = 1

    net = YOLOv3Net(cfg_path, input_shape, class_count)
    load_weights(net, cfg_path, weights_path)

    try:
        net.save_weights('weights/yolov3_weights.tf')
        print('\nThe file \'yolov3_weights.tf\' has been saved successfully.')
    except IOError:
        print("Couldn't write the file \'yolov3_weights.tf\'.")

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

PS: I got my original code from here and then I modified it.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Not the answer you're looking for? Browse other questions tagged or ask your own question.