[OpenCV实战]7 使用YOLOv3和OpenCV进行基于深度学习的目标检测
1.3 在Darknet和OpenCV上对YOLOv3进行速度测试
在这篇文章中,我们将学习如何在OpenCV上使用YOLOv3(目标检测网络)。YOLOv3是检测算法YOLO的最新变种,已发布的模型可识别图像和视频中的80种不同对象;更重要的是,它速度极快,并且几乎与Single Shot MultiBox(SSD)一样准确。从OpenCV 3.4.2开始,您可以在自己的OpenCV应用程序中轻松使用YOLOv3模型。
1 YOLO介绍
1.1 YOLOv3原理
在传统的计算机视觉方法中,使用滑动窗口来寻找不同位置和尺度的物体。因为这是非常耗时的操作,所以通常假设物体的纵横比是固定的。早期基于深度学习的对象检测算法(如R-CNN和Fast R-CNN)使用称为选择性搜索(Selective Search)的方法来扫描图像。另一种称为Overfeat的方法涉及使用卷积式滑动窗口机制在多个尺度上扫描图像。紧随其后的是Faster R-CNN,它使用区域候选网络(RPN)来识别需要测试的边界框。通过巧妙的设计,提取用于识别对象的特征也被RPN用于提出潜在的边界框,从而节省了大量的计算。
Faster R-CNN使用区域候选网络(RPN)来识别需要测试的边界框。通过巧妙的设计,提取用于识别对象的特征也被RPN用于提出潜在的边界框,从而节省了大量的计算。以上的目标检测网络都是两阶段检测器,目标定位和分类是分开的。
接下来说说单阶段检测器,例如SSD(常与MobileNet等骨干网络搭配使用)和YOLO。SSD是一种单阶段物体检测算法;另一方面,YOLO以完全不同的方式处理对象检测问题。YOLOv3和SSD只要获取图像就能直接得到目标检测结果,但YOLOv3比SSD快得多,同时实现了非常不错的精度。YOLOv3在M40、TitanX或1080 Ti GPU上速度很快。
1.2 为什么要将OpenCV用于YOLO?
- 与OpenCV应用程序轻松集成:如果您的应用程序已经使用OpenCV并且您只想使用YOLOv3,则无需担心编译和构建额外的Darknet代码。
- OpenCV CPU版本快9倍:OpenCV的DNN模块CPU实现速度惊人。例如,与OpenMP一起使用时,Darknet在CPU上花费大约2秒钟来对单个图像进行推理。相比之下,OpenCV的实现只需0.22秒!查看下表。
- Python支持:Darknet是用C语言编写的,并没有正式支持Python。相比之下,OpenCV官方支持Python。不过,Darknet也有可用的第三方Python移植版本。
1.3 在Darknet和OpenCV上对YOLOv3进行速度测试
下表显示了YOLOv3在Darknet与OpenCV上的性能。所有情况下的输入大小为416×416。毫无疑问,Darknet的GPU版本优于其他任何东西。使用OpenMP(并行计算工具)的Darknet比没有OpenMP的Darknet工作得更好也不足为奇,因为OpenMP允许使用多个处理器。令人惊讶的是,OpenCV的DNN CPU实现速度比使用OpenMP的Darknet快9倍。
| OS | Framework | CPU/GPU | Time(ms)/Frame |
|---|---|---|---|
| Linux 16.04 | Darknet | 12x Intel Core i7-6850K CPU @ 3.60GHz | 9370 |
| Linux 16.04 | Darknet + OpenMP | 12x Intel Core i7-6850K CPU @ 3.60GHz | 1942 |
| Linux 16.04 | OpenCV [CPU] | 12x Intel Core i7-6850K CPU @ 3.60GHz | 220 |
| Linux 16.04 | Darknet | NVIDIA GeForce 1080 Ti GPU | 23 |
| macOS | Darknet | 2.5 GHz Intel Core i7 CPU | 7260 |
| macOS | OpenCV [CPU] | 2.5 GHz Intel Core i7 CPU | 400 |
OpenCV的DNN GPU仅使用英特尔的GPU进行测试,因此如果您没有英特尔GPU,代码会将您切换回CPU。所以A卡就算了。
2 使用YOLOv3进行对象检测(C++/Python)
2.1 模型及配置文件下载
2.2 初始化参数
2.3 加载模型和获取输入图像
// Give the configuration and weight files for the model 模型参数文件
String modelConfiguration = "./model/yolov3.cfg";
String modelWeights = "./model/yolov3.weights";
// Load names of classes 读取分类类名
string classesFile = "./model/coco.names";
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line))
// Load the network 导入网络
Net net = readNetFromDarknet(modelConfiguration, modelWeights);
2.4 单帧图像处理
Mat blob;
clock_t start, finish;
Mat frame = imread("bird.jpg");
//resize(frame, frame, Size(300, 300));
start = clock();
// Create a 4D blob from the frame: scale pixel values by 1/255, resize to
// inpWidth x inpHeight, no mean subtraction, swap the R and B channels, no crop.
blobFromImage(frame, blob, 1 / 255.0, Size(inpWidth, inpHeight), Scalar(0, 0, 0), true, false);
// Set the blob as the input to the network. Without this call the forward
// pass below would not see the image.
net.setInput(blob);
// Run the forward pass and collect the outputs of the output layers.
vector<Mat> outs;
net.forward(outs, getOutputsNames(net));
// Drop low-confidence boxes, run non-maximum suppression and draw the result.
postprocess(frame, outs);
finish = clock();
cout << "time is " << double(finish - start) / CLOCKS_PER_SEC << endl;
// Put efficiency information on the frame. getPerfProfile returns the overall
// inference time in ticks and fills layersTimes with the per-layer timings;
// dividing by (tick frequency / 1000) converts ticks to milliseconds.
vector<double> layersTimes;
double freq = getTickFrequency() / 1000;
double t = net.getPerfProfile(layersTimes) / freq;
string label = format("Inference time for a frame : %.2f ms", t);
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 255));
2.4.1 获取输出层的名称
/**
 * @brief Get the names of the network's output layers.
 *
 * The names are computed on the first call and cached in a function-local
 * static, so every later call returns the cached list regardless of the
 * network passed in.
 *
 * @param net network to query
 * @return vector<String> names of the layers with unconnected outputs
 */
vector<String> getOutputsNames(const Net& net)
{
    static vector<String> names;
    if (names.empty())
    {
        // Indices of the output layers, i.e. the layers with unconnected outputs.
        vector<int> outLayers = net.getUnconnectedOutLayers();
        // Names of all the layers in the network.
        vector<String> layersNames = net.getLayerNames();
        // Size the result first: writing names[i] into an empty vector is
        // out of bounds.
        names.resize(outLayers.size());
        for (size_t i = 0; i < outLayers.size(); ++i)
        {
            // The layer indices are offset by one relative to layersNames.
            names[i] = layersNames[outLayers[i] - 1];
        }
    }
    return names;
}
2.4.2 处理网络的输出
* @brief Remove the bounding boxes with low confidence using non-maxima suppression 基于非极大性抑制去除边框
* @param frame 视频图像
* @param outs 输出层结果
void postprocess(Mat& frame, const vector<Mat>& outs)
vector<int> classIds;
vector<float> confidences;
vector<Rect> boxes;
for (size_t i = 0; i < outs.size(); ++i)
// Scan through all the bounding boxes output from the network and keep only the
// ones with high confidence scores. Assign the box's class label as the class
// with the highest score for the box.
float* data = (float*)outs[i].data;
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
Point classIdPoint;
double confidence;
// Get the value and location of the maximum score 获取置信度和位置参数
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
if (confidence > confThreshold)
int centerX = (int)(data[0] * frame.cols);
int centerY = (int)(data[1] * frame.rows);
int width = (int)(data[2] * frame.cols);
int height = (int)(data[3] * frame.rows);
int left = centerX - width / 2;
int top = centerY - height / 2;
boxes.push_back(Rect(left, top, width, height));
// Perform non maximum suppression to eliminate redundant overlapping boxes with
// lower confidences
vector<int> indices;
NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
for (size_t i = 0; i < indices.size(); ++i)
int idx = indices[i];
Rect box = boxes[idx];
drawPred(classIds[idx], confidences[idx], box.x, box.y,
box.x + box.width, box.y + box.height, frame);
2.4.3 画预测结果框格
* @brief Draw the predicted bounding box 画框
* @param classId 类别
* @param conf 置信度
* @param left
* @param top
* @param right
* @param bottom
* @param frame
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
//Draw a rectangle displaying the bounding box
rectangle(frame, Point(left, top), Point(right, bottom), Scalar(255, 178, 50), 3);
//Get the label for the class name and its confidence
string label = format("%.2f", conf);
if (!classes.empty())
CV_Assert(classId < (int)classes.size());
label = classes[classId] + ":" + label;
//Display the label at the top of the bounding box 在每个框左上角标上标签
int baseLine;
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
top = max(top, labelSize.height);
rectangle(frame, Point(left, top - round(1.5*labelSize.height)), Point(left + round(1.5*labelSize.width), top + baseLine), Scalar(255, 255, 255), FILLED);
putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 0, 0), 1);
3 结果和代码
3.1 结果
3.2 代码
#include "pch.h"
#include <fstream>
#include <sstream>
#include <iostream>
#include <time.h>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
using namespace cv;
using namespace dnn;
using namespace std;
// Initialize the parameters
// Confidence threshold: detections scoring at or below it are discarded in postprocess
float confThreshold = 0.5;
// Non-maximum suppression threshold passed to NMSBoxes
float nmsThreshold = 0.4;
// Width of the network input passed to blobFromImage
int inpWidth = 416;
// Height of the network input passed to blobFromImage
int inpHeight = 416;
// Class names; drawPred looks the label text up here by class id
vector<string> classes;
// Remove the bounding boxes with low confidence using non-maxima suppression
// and draw the remaining boxes on the frame
void postprocess(Mat& frame, const vector<Mat>& out);
// Draw one predicted bounding box with its label
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);
// Get the names of the network's output layers
vector<String> getOutputsNames(const Net& net);
int main()
// Give the configuration and weight files for the model 模型参数文件
String modelConfiguration = "./model/yolov3.cfg";
String modelWeights = "./model/yolov3.weights";
// Load names of classes 读取分类类名
string classesFile = "./model/coco.names";
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line))
// Load the network 导入网络
Net net = readNetFromDarknet(modelConfiguration, modelWeights);
// Open a video file or an image file or a camera stream.
string str, outputFile;
Mat blob;
clock_t start, finish;
Mat frame = imread("test.jpg");
//resize(frame, frame, Size(300, 300));
start = clock();
// Create a 4D blob from a frame. 创建神经网络输入图像
blobFromImage(frame, blob, 1 / 255.0, cvSize(inpWidth, inpHeight), Scalar(0, 0, 0), true, false);
//Sets the input to the network 设置输出
// Runs the forward pass to get output of the output layers 获取输出层结果
vector<Mat> outs;
net.forward(outs, getOutputsNames(net));
// Remove the bounding boxes with low confidence
postprocess(frame, outs);
finish = clock();
cout << "time is " << double(finish - start) / CLOCKS_PER_SEC << endl;
// Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
vector<double> layersTimes;
double freq = getTickFrequency() / 1000;
double t = net.getPerfProfile(layersTimes) / freq;
string label = format("Inference time for a frame : %.2f ms", t);
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 255));
imshow("result", frame);
imwrite("result.jpg", frame);
return 0;
* @brief Remove the bounding boxes with low confidence using non-maxima suppression 基于非极大性抑制去除边框
* @param frame 视频图像
* @param outs 输出层结果
void postprocess(Mat& frame, const vector<Mat>& outs)
vector<int> classIds;
vector<float> confidences;
vector<Rect> boxes;
for (size_t i = 0; i < outs.size(); ++i)
// Scan through all the bounding boxes output from the network and keep only the
// ones with high confidence scores. Assign the box's class label as the class
// with the highest score for the box.
float* data = (float*)outs[i].data;
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
Point classIdPoint;
double confidence;
// Get the value and location of the maximum score 获取置信度和位置参数
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
if (confidence > confThreshold)
int centerX = (int)(data[0] * frame.cols);
int centerY = (int)(data[1] * frame.rows);
int width = (int)(data[2] * frame.cols);
int height = (int)(data[3] * frame.rows);
int left = centerX - width / 2;
int top = centerY - height / 2;
boxes.push_back(Rect(left, top, width, height));
// Perform non maximum suppression to eliminate redundant overlapping boxes with
// lower confidences
vector<int> indices;
NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
for (size_t i = 0; i < indices.size(); ++i)
int idx = indices[i];
Rect box = boxes[idx];
drawPred(classIds[idx], confidences[idx], box.x, box.y,
box.x + box.width, box.y + box.height, frame);
* @brief Draw the predicted bounding box 画框
* @param classId 类别
* @param conf 置信度
* @param left
* @param top
* @param right
* @param bottom
* @param frame
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
//Draw a rectangle displaying the bounding box
rectangle(frame, Point(left, top), Point(right, bottom), Scalar(255, 178, 50), 3);
//Get the label for the class name and its confidence
string label = format("%.2f", conf);
if (!classes.empty())
CV_Assert(classId < (int)classes.size());
label = classes[classId] + ":" + label;
//Display the label at the top of the bounding box 在每个框左上角标上标签
int baseLine;
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
top = max(top, labelSize.height);
rectangle(frame, Point(left, top - round(1.5*labelSize.height)), Point(left + round(1.5*labelSize.width), top + baseLine), Scalar(255, 255, 255), FILLED);
putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 0, 0), 1);
/**
 * @brief Get the names of the network's output layers.
 *
 * The names are computed on the first call and cached in a function-local
 * static, so every later call returns the cached list regardless of the
 * network passed in.
 *
 * @param net network to query
 * @return vector<String> names of the layers with unconnected outputs
 */
vector<String> getOutputsNames(const Net& net)
{
    static vector<String> names;
    if (names.empty())
    {
        // Indices of the output layers, i.e. the layers with unconnected outputs.
        vector<int> outLayers = net.getUnconnectedOutLayers();
        // Names of all the layers in the network.
        vector<String> layersNames = net.getLayerNames();
        // Size the result first: writing names[i] into an empty vector is
        // out of bounds.
        names.resize(outLayers.size());
        for (size_t i = 0; i < outLayers.size(); ++i)
        {
            // The layer indices are offset by one relative to layersNames.
            names[i] = layersNames[outLayers[i] - 1];
        }
    }
    return names;
}
# This code is written at BigVision LLC. It is based on the OpenCV project. It is subject to the license terms in the LICENSE file found in this distribution and at http://opencv.org/license.html
# Usage example: python3 object_detection_yolo.py --video=run.mp4
# python3 object_detection_yolo.py --image=bird.jpg
import cv2 as cv
import argparse
import sys
import numpy as np
import os.path
# Initialize the parameters
confThreshold = 0.5  # Confidence threshold
nmsThreshold = 0.4   # Non-maximum suppression threshold
inpWidth = 416       # Width of network's input image
inpHeight = 416      # Height of network's input image

# Command-line options: an image path, a video path, or neither.
parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
parser.add_argument('--image', help='Path to image file.')
parser.add_argument('--video', help='Path to video file.')
args = parser.parse_args()

# Load names of classes, one per line
classesFile = "coco.names"
classes = None
with open(classesFile, 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')

# Give the configuration and weight files for the model and load the network using them.
modelConfiguration = "yolov3.cfg"
modelWeights = "yolov3.weights"
net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
# Get the names of the output layers
def getOutputsNames(net):
    """Return the names of the network's output layers.

    Args:
        net: object providing getLayerNames() and getUnconnectedOutLayers().

    Returns:
        List with the name of every layer that has unconnected outputs.
    """
    # Get the names of all the layers in the network
    layersNames = net.getLayerNames()
    # getUnconnectedOutLayers() yields an Nx1 array on some OpenCV versions and
    # a flat array on others; flattening handles both.
    outLayers = np.asarray(net.getUnconnectedOutLayers()).flatten()
    # The layer indices are offset by one relative to layersNames.
    return [layersNames[i - 1] for i in outLayers]
# Draw the predicted bounding box
def drawPred(classId, conf, left, top, right, bottom):
    """Draw one predicted box and its label on the module-level `frame`.

    Args:
        classId: index of the predicted class in `classes`.
        conf: confidence of the prediction, printed with two decimals.
        left, top, right, bottom: edges of the box.
    """
    # Draw a bounding box.
    cv.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)

    label = '%.2f' % conf

    # Prefix the class name when class names are loaded
    if classes:
        assert(classId < len(classes))
        label = '%s:%s' % (classes[classId], label)

    # Display the label at the top of the bounding box
    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
    # Keep the label inside the image when the box touches the top edge
    top = max(top, labelSize[1])
    # The text is measured at font scale 0.5 but drawn at 0.75, so the white
    # background is enlarged by the same factor of 1.5.
    cv.rectangle(frame, (left, top - round(1.5*labelSize[1])), (left + round(1.5*labelSize[0]), top + baseLine), (255, 255, 255), cv.FILLED)
    cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,0), 1)
# Remove the bounding boxes with low confidence using non-maxima suppression
def postprocess(frame, outs):
    """Filter the raw network detections and draw the surviving boxes.

    Keeps the detections whose best class score exceeds confThreshold, runs
    non-maximum suppression with nmsThreshold and draws the remaining boxes
    through drawPred.

    Args:
        frame: image the detections refer to; its shape gives the pixel scale.
        outs: outputs of the network's output layers; each detection is read as
            centre x, centre y, width, height (scaled by the frame size below),
            with the class scores starting at index 5.
    """
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    # Scan through all the bounding boxes output from the network and keep only the
    # ones with high confidence scores. Assign the box's class label as the class with the highest score.
    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                # The three lists must stay index-aligned: NMSBoxes needs one
                # score per box and the drawing loop reads all three by index.
                classIds.append(classId)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])

    # Perform non maximum suppression to eliminate redundant overlapping boxes with
    # lower confidences.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    # NMSBoxes yields an Nx1 array on some OpenCV versions and a flat array (or
    # an empty tuple) on others; flattening handles all of them.
    for i in np.asarray(indices).flatten():
        box = boxes[i]
        left = box[0]
        top = box[1]
        width = box[2]
        height = box[3]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
# Process inputs
winName = 'Deep learning object detection in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

outputFile = "yolo_out_py.avi"
if (args.image):
    # Open the image file
    if not os.path.isfile(args.image):
        print("Input image file ", args.image, " doesn't exist")
        sys.exit(1)
    cap = cv.VideoCapture(args.image)
    outputFile = args.image[:-4]+'_yolo_out_py.jpg'
elif (args.video):
    # Open the video file
    if not os.path.isfile(args.video):
        print("Input video file ", args.video, " doesn't exist")
        sys.exit(1)
    cap = cv.VideoCapture(args.video)
    outputFile = args.video[:-4]+'_yolo_out_py.avi'
else:
    # Webcam input: only used when neither --image nor --video is given, so it
    # must not overwrite the capture opened above.
    cap = cv.VideoCapture(0)

# Get the video writer initialized to save the output video
if (not args.image):
    vid_writer = cv.VideoWriter(outputFile, cv.VideoWriter_fourcc('M','J','P','G'), 30, (round(cap.get(cv.CAP_PROP_FRAME_WIDTH)),round(cap.get(cv.CAP_PROP_FRAME_HEIGHT))))

while cv.waitKey(1) < 0:

    # get frame from the video
    hasFrame, frame = cap.read()

    # Stop the program if reached end of video
    if not hasFrame:
        print("Done processing !!!")
        print("Output file is stored as ", outputFile)
        # Keep the last result on screen briefly before leaving the loop
        cv.waitKey(3000)
        # Release device
        cap.release()
        break

    # Create a 4D blob from a frame.
    blob = cv.dnn.blobFromImage(frame, 1/255, (inpWidth, inpHeight), [0,0,0], 1, crop=False)

    # Sets the input to the network
    net.setInput(blob)

    # Runs the forward pass to get output of the output layers
    outs = net.forward(getOutputsNames(net))

    # Remove the bounding boxes with low confidence
    postprocess(frame, outs)

    # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
    t, _ = net.getPerfProfile()
    label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))

    # Write the frame with the detection boxes
    if (args.image):
        cv.imwrite(outputFile, frame.astype(np.uint8))
    else:
        vid_writer.write(frame.astype(np.uint8))

    cv.imshow(winName, frame)
4 参考
