Yolov4性能分析(下)
Yolov4性能分析(下)
六. 权重更新
"darknet/src/detector.c"--train_detector()函数中:
......
/* 开始训练网络 */
float loss = 0;
#ifdef GPU
if (ngpus == 1) {
int wait_key = (dont_show) ? 0 : 1;
loss = train_network_waitkey(net, train, wait_key); // network.c中,train_network_waitkey函数入口,分配内存并执行网络训练。
}
else {
loss = train_networks(nets, ngpus, train, 4); // network_kernels.cu中,train_networks函数入口,多GPU训练。
}
#else
loss = train_network(net, train); // train_network_waitkey(net, d, 0),CPU模式。
#endif
if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss; // if(-inf or nan)
avg_loss = avg_loss*.9 + loss*.1;
......
以CPU训练为例,"darknet/src/network.c"--train_network()函数,执行train_network_waitkey(net, d, 0):
/*
 * Train the network on every mini-batch contained in `d` and then apply one
 * weight update.
 *
 * net      - network to train; net.batch is the mini-batch size used here
 *            (the article notes it has already been divided by subdivisions
 *            in parse_net_options - not visible in this listing).
 * d        - loaded training data; d.X.rows must be a multiple of net.batch.
 * wait_key - when non-zero, wait_key_cv(5) is called after each mini-batch.
 *
 * Returns the summed error divided by the number of samples processed
 * (num_batches * batch_size).
 */
float train_network_waitkey(network net, data d, int wait_key)
{
    assert(d.X.rows % net.batch == 0);

    const int batch_size = net.batch;
    const int num_batches = d.X.rows / batch_size; /* mini-batches held in d */

    /* Scratch buffers holding one mini-batch of inputs and of truth values. */
    float *inputs = (float *)xcalloc(batch_size * d.X.cols, sizeof(float));
    float *truths = (float *)xcalloc(batch_size * d.y.cols, sizeof(float));

    float total_err = 0;
    int step = 0;
    while (step < num_batches) {
        get_next_batch(d, batch_size, step * batch_size, inputs, truths);
        net.current_subdivision = step;
        /* Forward + backward pass on this mini-batch; returns its loss. */
        total_err += train_network_datum(net, inputs, truths);
        if (wait_key) {
            wait_key_cv(5);
        }
        ++step;
    }

    /* One iteration is complete once every mini-batch has been processed. */
    (*net.cur_iteration) += 1;
#ifdef GPU
    update_network_gpu(net);
#else   /* GPU */
    update_network(net);
#endif  /* GPU */

    free(inputs);
    free(truths);
    return (float)total_err / (num_batches * batch_size);
}
其中,调用train_network_datum()函数计算error是核心:
/*
 * Run one forward and one backward pass over a single mini-batch and return
 * the resulting network cost.
 *
 * net - network being trained; *net.seen is advanced by net.batch.
 * x   - input values for the mini-batch.
 * y   - ground-truth values for the mini-batch.
 *
 * When built with GPU support and gpu_index >= 0 the work is delegated to
 * train_network_datum_gpu(); otherwise the CPU path below is used.
 */
float train_network_datum(network net, float *x, float *y)
{
#ifdef GPU
    if (gpu_index >= 0) {
        return train_network_datum_gpu(net, x, y);
    }
#endif
    /* Count the samples consumed by this call. */
    *net.seen += net.batch;

    network_state st = {0};
    st.train = 1;   /* training mode */
    st.truth = y;
    st.delta = 0;
    st.input = x;
    st.net = net;
    st.index = 0;

    forward_network(net, st);   /* CPU forward pass */
    backward_network(net, st);  /* CPU back-propagation */

    return get_network_cost(net);
}
进一步分析forward_network()函数:
/*
 * CPU forward pass: invoke every layer's forward callback in order, feeding
 * each layer's output to the next layer as its input.
 *
 * net   - network whose layers are evaluated (net.n layers).
 * state - per-pass state (passed by value); its workspace is taken from net.
 */
void forward_network(network net, network_state state)
{
    int idx = 0;

    state.workspace = net.workspace;
    while (idx < net.n) {
        layer cur = net.layers[idx];

        state.index = idx;
        /* In training mode, zero this layer's delta buffer before it is used. */
        if (state.train && cur.delta) {
            scal_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);
        }
        /* Layer-specific forward function, e.g. forward_convolutional_layer
         * or forward_yolo_layer. */
        cur.forward(cur, state);
        /* This layer's output is the next layer's input. */
        state.input = cur.output;
        ++idx;
    }
}
卷积层时,forward_convolutional_layer()函数:
/*
 * CPU forward pass of a convolutional layer.
 *
 * l     - the convolutional layer (passed by value).
 * state - network state; state.input is this layer's input, state.workspace
 *         is scratch memory, state.train is non-zero while training.
 *
 * The result is written to l.output. For the XNOR inference path the function
 * adds the bias, applies the activation and returns from inside the loop.
 */
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    /* Spatial size of the output feature map. */
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1); /* zero the output buffer */

    /* XNOR-net: binarize weights and inputs. */
    if (l.xnor && (!l.align_bit_weights || state.train)) {
        if (!l.align_bit_weights || state.train) {
            binarize_weights(l.weights, l.n, l.nweights, l.binary_weights);
        }
        swap_binary(&l);
        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
        state.input = l.binary_input;
    }

    /* GEMM dimensions:
     *   m - number of filters per group,
     *   k - number of weights per filter (size*size*channels per group),
     *   n - number of pixels in one output feature map. */
    int m = l.n / l.groups;
    int k = l.size*l.size*l.c / l.groups;
    int n = out_h*out_w;

    for (i = 0; i < l.batch; ++i)
    {
        for (j = 0; j < l.groups; ++j)
        {
            /* a -> weights of this group, b -> workspace, c -> output slice. */
            float *a = l.weights + j*l.nweights / l.groups;
            float *b = state.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;

            if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y)
            {
                memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float));

                if (l.c % 32 == 0)
                {
                    int ldb_align = l.lda_align;
                    size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;

                    int re_packed_input_size = l.c * l.w * l.h;
                    memset(state.workspace, 0, re_packed_input_size * sizeof(float));

                    const size_t new_c = l.c / 32;
                    size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
                    memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));

                    // float32x4 by channel (as in cuDNN)
                    repack_input(state.input, state.workspace, l.w, l.h, l.c);

                    // 32 x floats -> 1 x uint32_t
                    float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h);

                    /* image-to-column: unroll every kernel-sized patch into a
                     * column so the convolution becomes a matrix product. */
                    im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);

                    int new_k = l.size*l.size*l.c / 32;

                    transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb);

                    /* Binary matrix multiply, i.e. the convolution itself. */
                    gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
                }
                else
                {
                    im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

                    // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits)
                    {
                        int ldb_align = l.lda_align;
                        size_t new_ldb = k + (ldb_align - k%ldb_align);
                        size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align);

                        // 5x times faster than gemm()-float32
                        gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
                    }
                }

                add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); /* add the bias term */

                /* Non-linearity (SWISH, MISH, channel normalisation, ...). */
                if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
                else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
                else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
                else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
                return;
            }
            else {
                /* Input slice for image i, group j. */
                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
                if (l.size == 1) {
                    /* 1x1 kernel: the input can be used directly as the column matrix. */
                    b = im;
                }
                else {
                    im2col_cpu_ext(im,                          // input
                        l.c / l.groups,                         // input channels
                        l.h, l.w,                               // input size (h, w)
                        l.size, l.size,                         // kernel size (h, w)
                        l.pad * l.dilation, l.pad * l.dilation, // padding (h, w)
                        l.stride_y, l.stride_x,                 // stride (h, w)
                        l.dilation, l.dilation,                 // dilation (h, w)
                        b);                                     // output
                }

                gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                // bit-count to float
            }
        }
    }

    if (l.batch_normalize) {
        /* Batch-normalisation path. */
        forward_batchnorm_layer(l, state);
    }
    else {
        /* No batch norm: add the bias directly, output += bias. */
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    }

    /* Non-linearity (SWISH, MISH, channel normalisation, ...). */
    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
    else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
    else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
    else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);

    /* NOTE(review): presumably swaps the float weights back in after the
     * binarized pass - confirm against swap_binary(). */
    if (l.binary || l.xnor) swap_binary(&l);

    if (l.assisted_excitation && state.train) assisted_excitation_forward(l, state);

    if (l.antialiasing) {
        /* Run the attached input_layer on this layer's output and copy its
         * result back into l.output. */
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.input = l.output;
        forward_convolutional_layer(*(l.input_layer), s);
        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
    }
}
yolo层时,forward_yolo_layer()函数:
void forward_yolo_layer(const layer l, network_state
state)
{
int i, j, b, t,
n;
memcpy(l.output, state.input, l.outputs*l.batch * sizeof(float)); // 将层输入直接copy到层输出。
/* 在cpu模式,把预测输出的x,y,confidence和所有类别都sigmoid激活,确保值在0~1之间。*/
#ifndef GPU
for (b = 0; b
< l.batch; ++b) {
for (n = 0;
n < l.n; ++n) {
int
index = entry_index(l, b, n*l.w*l.h, 0); // 获取第b个batch开始的index。
/* 对预测的tx,ty进行逻辑回归。*/
activate_array(l.output + index, 2 * l.w*l.h, LOGISTIC); // x,y,
scal_add_cpu(2 * l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output
+ index, 1); // scale x,y
index =
entry_index(l, b, n*l.w*l.h, 4); // 获取第b个batch confidence开始的index。
activate_array(l.output + index, (1 + l.classes)*l.w*l.h, LOGISTIC); // 对预测的confidence以及class进行逻辑回归。
}
}
#endif
// delta is
zeroed
memset(l.delta,
0, l.outputs * l.batch * sizeof(float)); // 将yolo层的误差项进行初始化(包含整个batch的)。
if
(!state.train) return; // 不是训练阶段,return。
float tot_iou =
0; // 总的IOU。
float tot_giou
= 0;
float tot_diou
= 0;
float tot_ciou
= 0;
float
tot_iou_loss = 0;
float
tot_giou_loss = 0;
float
tot_diou_loss = 0;
float
tot_ciou_loss = 0;
float recall =
0;
float recall75
= 0;
float avg_cat =
0;
float avg_obj =
0;
float
avg_anyobj = 0;
int count = 0;
int class_count
= 0;
*(l.cost) = 0;
// yolo层的总损失初始化为0。
for (b = 0; b
< l.batch; ++b) { // 遍历batch中的每一张图片。
for (j = 0;
j < l.h; ++j) {
for (i
= 0; i < l.w; ++i) { // 遍历每个Grid
cell, 当前cell编号[j, i]。
for
(n = 0; n < l.n; ++n) { // 遍历每一个bbox,当前bbox编号[n]。
const int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); // 预测b-box类别(class)下标。
const int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); // 预测b-box objectness下标。
const int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); // 获得第j*w+i个cell第n个b-box的index。
const int stride = l.w*l.h;
/* 计算第j*w+i个cell第n个b-box在当前特征图上的相对位置[x,y],在网络输入图片上的相对宽度、高度[w,h]。*/
box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j,
l.w, l.h, state.net.w, state.net.h, l.w*l.h);
float best_match_iou = 0;
int best_match_t = 0;
float best_iou = 0; // 保存最大IOU。
int best_t = 0; // 保存最大IOU的bbox id。
for (t = 0; t < l.max_boxes; ++t) { // 遍历每一个GT bbox。
box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths,
1); // 将第t个bbox由float数组转bbox结构体,方便计算IOU。
int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; // 获取第t个bbox的类别,检查是否有标注错误。
if (class_id >= l.classes || class_id < 0) {
printf("\n
Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels
class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes -
1);
printf("\n
truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n",
truth.x, truth.y, truth.w, truth.h, class_id);
if (check_mistakes)
getchar();
continue; // if
label contains class_id more than number of classes in the cfg-file and
class_id check garbage value
}
if (!truth.x) break; // 如果x坐标为0则break,因为定义了max_boxes个b-box。
float objectness = l.output[obj_index]; // 预测bbox object置信度。
if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;
/* 获得预测b-box的类别信息,如果某个类别的概率超过0.25返回1。*/
int class_id_match = compare_yolo_class(l.output, l.classes,
class_index, l.w*l.h, objectness, class_id, 0.25f);
float iou = box_iou(pred, truth); // 计算pred b-box与第t个GT
bbox之间的IOU。
if (iou > best_match_iou && class_id_match == 1) { //
class_id_match=1的限制,即预测b-box的置信度必须大于0.25。
best_match_iou = iou;
best_match_t = t;
}
if (iou > best_iou) {
best_iou = iou; // 更新最大的IOU。
best_t = t; // 记录该GT b-box的编号t。
}
}
avg_anyobj += l.output[obj_index]; // 统计pred b-box的confidence。
l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]); // 将所有pred b-box都当做noobject, 计算其confidence梯度,cls_normalizer是平衡系数。
if (best_match_iou > l.ignore_thresh) { // best_iou大于阈值则说明pred box有物体。
const float iou_multiplier = best_match_iou*best_match_iou;//
(best_match_iou - l.ignore_thresh) / (1.0 - l.ignore_thresh);
if (l.objectness_smooth) {
l.delta[obj_index]
= l.cls_normalizer * (iou_multiplier - l.output[obj_index]);
int class_id =
state.truth[best_match_t*(4 + 1) + b*l.truths + 4];
if (l.map) class_id
= l.map[class_id];
const float
class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] :
1.0f;
l.delta[class_index
+ stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index
+ stride*class_id]);
}
else l.delta[obj_index] = 0;
}
else if (state.net.adversarial) { // 自对抗训练。
int stride = l.w*l.h;
float scale = pred.w * pred.h;
if (scale > 0) scale = sqrt(scale);
l.delta[obj_index] = scale *
l.cls_normalizer * (0 - l.output[obj_index]);
int cl_id;
for (cl_id = 0; cl_id < l.classes; ++cl_id) {
if(l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25)
l.delta[class_index + stride*cl_id] = scale * (0 - l.output[class_index
+ stride*cl_id]);
}
}
if (best_iou > l.truth_thresh) { // pred b-box为完全预测正确样本,cfg中truth_thresh=1,语句永远不可能成立。
const float iou_multiplier = best_iou*best_iou;// (best_iou -
l.truth_thresh) / (1.0 - l.truth_thresh);
if (l.objectness_smooth) l.delta[obj_index] = l.cls_normalizer *
(iou_multiplier - l.output[obj_index]);
else l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]);
int class_id = state.truth[best_t*(4 + 1) + b*l.truths + 4];
if (l.map) class_id = l.map[class_id];
delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes,
l.w*l.h, 0, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);
const float class_multiplier = (l.classes_multipliers) ?
l.classes_multipliers[class_id] : 1.0f;
if (l.objectness_smooth) l.delta[class_index + stride*class_id] =
class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
box truth = float_to_box_stride(state.truth + best_t*(4 + 1) +
b*l.truths, 1);
delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j,
l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h,
l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
}
}
}
}
for (t = 0;
t < l.max_boxes; ++t) { // 遍历每一个GT box。
box
truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); // 将第t个b-box由float数组转b-box结构体,方便计算IOU。
if
(truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 ||
truth.w < 0 || truth.h < 0) {
char buff[256];
printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f,
truth.h = %f \n", truth.x, truth.y, truth.w, truth.h);
sprintf(buff, "echo \"Wrong label: truth.x = %f, truth.y = %f,
truth.w = %f, truth.h = %f\" >> bad_label.list",
truth.x, truth.y, truth.w, truth.h);
system(buff);
}
int
class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
if
(class_id >= l.classes || class_id < 0) continue; // if label contains
class_id more than number of classes in the cfg-file and class_id check garbage
value
if
(!truth.x) break; // 如果x坐标为0则取消,定义了max_boxes个bbox,可能实际上没那么多。
float
best_iou = 0; // 保存最大的IOU。
int
best_n = 0; // 保存最大IOU的b-box
index。
i =
(truth.x * l.w); // 获得当前t个GT b-box所在的cell。
j =
(truth.y * l.h);
box
truth_shift = truth;
truth_shift.x = truth_shift.y = 0; // 将truth_shift的box位置移动到0,0。
for (n
= 0; n < l.total; ++n) { // 遍历每一个anchor
b-box找到与GT b-box最大的IOU。
box
pred = { 0 };
pred.w = l.biases[2 * n] / state.net.w; // 计算pred b-box的w在相对整张输入图片的位置。
pred.h = l.biases[2 * n + 1] / state.net.h; // 计算pred bbox的h在相对整张输入图片的位置。
float iou = box_iou(pred, truth_shift); // 计算GT box truth_shift与预测b-box pred二者之间的IOU。
if
(iou > best_iou) {
best_iou = iou; // 记录最大的IOU。
best_n = n; // 记录该b-box的编号n。
}
}
int
mask_n = int_index(l.mask, best_n, l.n); // 上面记录b-box的编号,是否由该层Anchor预测的。
if
(mask_n >= 0) {
int
class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
if
(l.map) class_id = l.map[class_id];
int
box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); // 获得best_iou对应anchor box的index。
const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id]
: 1.0f; // 控制样本数量不均衡,即Focal Loss中的alpha。
ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n,
box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 -
truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1,
l.max_delta); // 计算best_iou对应Anchor bbox的[x,y,w,h]的梯度。
/* 目标检测最新的工作,metric learning,包括IOU/GIOU/DIOU/CIOU Loss等。*/
//
range is 0 <= 1
tot_iou += all_ious.iou;
tot_iou_loss
+= 1 - all_ious.iou;
//
range is -1 <= giou <= 1
tot_giou += all_ious.giou;
tot_giou_loss += 1 - all_ious.giou;
tot_diou += all_ious.diou;
tot_diou_loss += 1 - all_ious.diou;
tot_ciou += all_ious.ciou;
tot_ciou_loss += 1 - all_ious.ciou;
int
obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); // 获得best_iou对应anchor box的confidence的index。
avg_obj += l.output[obj_index]; // 统计confidence。
l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 -
l.output[obj_index]); // 计算confidence的梯度。
int
class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1); // 获得best_iou对应GT box的class的index。
delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes,
l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps,
l.classes_multipliers); // 获得best_iou对应anchor box的class的index。
++count;
++class_count;
if
(all_ious.iou > .5) recall += 1;
if
(all_ious.iou > .75) recall75 += 1;
}
//
iou_thresh
for (n
= 0; n < l.total; ++n) {
int
mask_n = int_index(l.mask, n, l.n);
if
(mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) {
box pred = { 0 };
pred.w = l.biases[2 * n] / state.net.w;
pred.h = l.biases[2 * n + 1] / state.net.h;
float iou = box_iou_kind(pred, truth_shift, l.iou_thresh_kind); // IOU,
GIOU, MSE, DIOU, CIOU
// iou, n
if (iou > l.iou_thresh) {
int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
if (l.map) class_id = l.map[class_id];
int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
const float class_multiplier =
(l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index,
i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h),
l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
// range is 0 <= 1
tot_iou += all_ious.iou;
tot_iou_loss += 1 - all_ious.iou;
// range is -1 <= giou <= 1
tot_giou += all_ious.giou;
tot_giou_loss += 1 - all_ious.giou;
tot_diou += all_ious.diou;
tot_diou_loss += 1 - all_ious.diou;
tot_ciou += all_ious.ciou;
tot_ciou_loss += 1 - all_ious.ciou;
int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
avg_obj += l.output[obj_index];
l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 -
l.output[obj_index]);
int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes,
l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps,
l.classes_multipliers);
++count;
++class_count;
if (all_ious.iou > .5) recall += 1;
if (all_ious.iou > .75) recall75 += 1;
}
}
}
}
// averages
the deltas obtained by the function: delta_yolo_box()_accumulate
for (j = 0;
j < l.h; ++j) {
for (i
= 0; i < l.w; ++i) {
for
(n = 0; n < l.n; ++n) {
int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); // 获得第j*w+i个cell第n个b-box的index。
int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); // 获得第j*w+i个cell第n个b-box的类别。
const int stride = l.w*l.h; // 特征图的大小。
averages_yolo_deltas(class_index, box_index, stride, l.classes,
l.delta); // 对梯度进行平均。
}
}
}
}
......
// gIOU loss + MSE (objectness) loss
if (l.iou_loss == MSE) {
*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch),
2);
}
else {
// Always compute classification loss both for iou + cls
loss and for logging with mse loss
// TODO: remove IOU loss fields before computing MSE on
class
// probably split into two arrays
if (l.iou_loss == GIOU) {
avg_iou_loss = count > 0 ? l.iou_normalizer *
(tot_giou_loss / count) : 0; // 平均IOU损失,参考上面代码,tot_iou_loss += 1 -
all_ious.iou。
}
else {
avg_iou_loss = count > 0 ? l.iou_normalizer *
(tot_iou_loss / count) : 0; // 平均IOU损失,参考上面代码,tot_iou_loss += 1 - all_ious.iou。
}
*(l.cost) = avg_iou_loss + classification_loss; // Loss值传递给l.cost,IOU与分类损失求和。
}
loss /= l.batch; // 平均Loss。
classification_loss /= l.batch;
iou_loss /= l.batch;
......
}
再来分析backward_network()函数:
/*
 * CPU backward pass: walk the layers from last to first and invoke each
 * layer's backward callback.
 *
 * net   - network whose layers are traversed.
 * state - per-pass state (passed by value). For layer i > 0, state.input and
 *         state.delta are pointed at layer i-1's output and delta; for layer 0
 *         the values the caller passed in are restored.
 */
void backward_network(network net, network_state state)
{
    float *first_input = state.input;
    float *first_delta = state.delta;
    int idx;

    state.workspace = net.workspace;
    for (idx = net.n - 1; idx >= 0; --idx) {
        state.index = idx;
        if (idx > 0) {
            layer below = net.layers[idx - 1];
            state.input = below.output;
            /* delta is a pointer: writes through state.delta land in the
             * previous layer's delta buffer. */
            state.delta = below.delta;
        } else {
            state.input = first_input;
            state.delta = first_delta;
        }

        layer cur = net.layers[idx];
        if (cur.stopbackward) {
            break;      /* stop propagating at this layer */
        }
        if (cur.onlyforward) {
            continue;   /* this layer takes no part in the backward pass */
        }
        /* Layer-specific backward function, e.g. backward_convolutional_layer
         * or backward_yolo_layer. */
        cur.backward(cur, state);
    }
}
卷积层时,backward_convolutional_layer()函数:
/*
 * CPU backward pass of a convolutional layer.
 *
 * l     - the convolutional layer (passed by value); l.delta holds the
 *         gradient arriving at this layer's output.
 * state - network state. backward_network() has already pointed state.input
 *         at the previous layer's output and state.delta at the previous
 *         layer's delta (NULL/0 when no gradient must be propagated further).
 *
 * Accumulates weight gradients into l.weight_updates and, when state.delta is
 * set, writes the gradient for the previous layer into state.delta.
 */
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int i, j;
    /* GEMM dimensions (note k and n differ from the forward pass):
     *   m - number of filters per group,
     *   n - number of weights per filter (size*size*channels per group),
     *   k - number of pixels in one output feature map. */
    int m = l.n / l.groups;
    int n = l.size*l.size*l.c / l.groups;
    int k = l.out_w*l.out_h;

    /* Back-propagate through the activation: updates l.delta in place. */
    if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else if (l.activation == NORM_CHAN) gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        /* Batch-normalisation path. */
        backward_batchnorm_layer(l, state);
    }
    else {
        /* No batch norm: compute the bias gradient into l.bias_updates. */
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for (i = 0; i < l.batch; ++i) {
        for (j = 0; j < l.groups; ++j) {
            float *a = l.delta + (i*l.groups + j)*m*k;
            float *b = state.workspace;
            float *c = l.weight_updates + j*l.nweights / l.groups;

            /* state.input is the previous layer's output (see header). */
            float *im = state.input + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w;

            im2col_cpu_ext(
                im,                                     // input
                l.c / l.groups,                         // input channels
                l.h, l.w,                               // input size (h, w)
                l.size, l.size,                         // kernel size (h, w)
                l.pad * l.dilation, l.pad * l.dilation, // padding (h, w)
                l.stride_y, l.stride_x,                 // stride (h, w)
                l.dilation, l.dilation,                 // dilation (h, w)
                b);                                     // output

            /* Weight gradient for this layer. */
            gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

            /* Gradient for the previous layer; state.delta is the previous
             * layer's delta (see header). */
            if (state.delta) {
                a = l.weights + j*l.nweights / l.groups;
                b = l.delta + (i*l.groups + j)*m*k;
                c = state.workspace;

                gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);

                col2im_cpu_ext(
                    state.workspace,                        // input
                    l.c / l.groups,                         // input channels
                    l.h, l.w,                               // input size (h, w)
                    l.size, l.size,                         // kernel size (h, w)
                    l.pad * l.dilation, l.pad * l.dilation, // padding (h, w)
                    l.stride_y, l.stride_x,                 // stride (h, w)
                    l.dilation, l.dilation,                 // dilation (h, w)
                    state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w); // output (delta)
            }
        }
    }
}
yolo层时,backward_yolo_layer()函数:
/*
 * Backward pass of the yolo layer: hand this layer's delta to the previous
 * layer. state.delta points at the previous layer's delta buffer.
 *
 * axpy_cpu with a scale of 1 adds l.delta element-wise onto state.delta
 * (Y += 1 * X), i.e. the gradient is accumulated rather than overwritten.
 */
void backward_yolo_layer(const layer l, network_state state)
{
    const int count = l.batch*l.inputs;

    axpy_cpu(count, 1, l.delta, 1, state.delta, 1);
}
正向、反向传播后,通过get_network_cost()函数计算Loss:
/*
 * Average the cost over every layer that exposes a cost buffer.
 *
 * net - network whose layers are inspected.
 *
 * Returns the sum of cost[0] over all layers with a non-NULL cost pointer,
 * divided by the number of such layers. The article notes that in this
 * network only the detection (yolo) layers carry a cost.
 */
float get_network_cost(network net)
{
    float total = 0;
    int contributing = 0;
    int idx;

    for (idx = 0; idx < net.n; ++idx) {
        const layer *cur = &net.layers[idx];

        if (!cur->cost) {
            continue;   /* layer has no cost buffer */
        }
        total += cur->cost[0];  /* the layer's loss is stored in cost[0] */
        ++contributing;
    }
    return total/contributing;
}
CIOU_Loss是创新点,与GIOU_Loss相比,引入了重叠面积与中心点的距离Dis_2来区分预测框a与b的定位差异,同时还引入了预测框和目标框的长宽比一致性因子ν,将a与c这种重叠面积与中心点距离相同但长宽比与目标框适配程度有差异的预测框区分开来,如图:
计算好Loss需要update_network():
/*
 * Apply the accumulated gradients: call the update callback of every layer
 * that has one.
 *
 * net - network to update. The batch passed to the callbacks is
 *       net.batch * net.subdivisions; the learning rate comes from
 *       get_current_rate(net).
 */
void update_network(network net)
{
    const int full_batch = net.batch*net.subdivisions;
    const float lr = get_current_rate(net);
    int idx = 0;

    while (idx < net.n) {
        layer cur = net.layers[idx];

        /* Layers without an update callback are left untouched. */
        if (cur.update) {
            /* e.g. update_convolutional_layer for convolutional layers */
            cur.update(cur, full_batch, lr, net.momentum, net.decay);
        }
        ++idx;
    }
}
update_convolutional_layer()函数:
/*
 * Apply one gradient-descent step with momentum and weight decay to a
 * convolutional layer (CPU path).
 *
 * l                  - the convolutional layer (passed by value; the weight,
 *                      bias and scale arrays are modified through pointers).
 * batch              - number of samples the accumulated gradients cover.
 * learning_rate_init - base learning rate, scaled by l.learning_rate_scale.
 * momentum           - factor applied to the gradient buffers after the step.
 * decay              - weight-decay coefficient.
 *
 * axpy_cpu(N, a, X, 1, Y, 1) performs Y[i] += a * X[i];
 * scal_cpu(N, a, X, 1) performs X[i] *= a.
 */
void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate_init, float momentum, float decay)
{
    float learning_rate = learning_rate_init*l.learning_rate_scale;

    /* Weight decay: weight_updates[i] -= decay * batch * weights[i]. */
    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
    /* Step: weights[i] += (learning_rate / batch) * weight_updates[i]. */
    axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    /* Momentum: weight_updates[i] *= momentum. */
    scal_cpu(l.nweights, momentum, l.weight_updates, 1);

    /* Step: biases[i] += (learning_rate / batch) * bias_updates[i]. */
    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    /* Momentum: bias_updates[i] *= momentum. */
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    /* Scales are only present on some layers; update them the same way. */
    if (l.scales) {
        axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }
}
同样,在network_kernels.cu里,有GPU模式下的forward&backward相关的函数,涉及数据格式转换及加速,此处只讨论原理,暂时忽略GPU部分的代码。
void
forward_backward_network_gpu(network net, float *x, float *y)
{
......
forward_network_gpu(net, state); // 正向。
backward_network_gpu(net, state); // 反向。
......
}
CPU模式下,采用带momentum的常规GD更新weights,同时在network.c中也提供了train_network_sgd()函数接口;GPU模式提供了adam选项,convolutional_layer.c中make_convolutional_layer()函数有体现。
七. 调参总结
本人在实际项目中涉及的是工业中的钢铁表面缺陷检测场景,不到2000张图片,3类,数据量很少。理论上YOLO系列并不太适合缺陷检测的问题,基于分割+分类的网络、Cascade-RCNN等或许是更好的选择,但我本着实验的态度,进行了多轮的训练和对比,整体上效果还是不错的。
1.max_batches: AlexeyAB在github工程上有提到,类别数*2000作为参考,不要少于6000,但这个是使用预训练权重的情况。如果train from scratch,要适当增加,具体要看你的数据情况,网络需要额外的时间来从零开始学习;
2.pretrain or not:当数据量很少时,预训练确实能更快使模型收敛,效果也不错,但缺陷检测这类问题,缺陷目标特征本身的特异性还是比较强的,虽然我的数据量也很少,但scratch的方式还是能取得稍好一些的效果;
3.anchors:cfg文件默认的anchors是基于COCO数据集,可以说尺度比较均衡,使用它效果不会差,但如果你自己的数据在尺度分布上不太均衡,建议自行生成新的anchors,可以直接使用源码里面的脚本,注意,要根据生成anchors的size(1-yolo:<30*30,2-yolo:<60*60,3-yolo:others)来改变索引值masks以及前一个conv层的filters参数;
4.rotate:YOLO-V4在目标检测这一块,其实没有用到旋转来进行数据增强,因此我在线下对数量最少的一个类进行了180°旋转对称增强,该类样本数扩增一倍,效果目前还不明显,可能是数据量增加的还是太少,而且我还在训练对比,完成后可以补充;
5.mosaic:马赛克数据增强是必须要有的,mAP值提升比较明显,需要安装opencv,且和cutmix不能同时使用。
- Draw object:
#if defined(OPENCV) && defined(GPU)
read_data_cfg
option_find_str
get_labels_custom
load_alphabet
parse_network_cfg
parse_network_cfg_custom
set_batch_network
load_weights
load_image
resize_image
copy_image
cv_draw_object
basecfg
draw_train_chart
forward_backward_network_gpu
draw_train_loss
crop_image
copy_image_inplace
embed_image
show_image_cv
quantize_image
network_predict
save_image_png
get_network_boxes
do_nms_sort
diounms_sort
draw_detections_v3
save_image
- calc_anchors
read_data_cfg
option_find_str
get_paths
list_to_array
option_find_int
replace_image_to_label
read_boxes
counter_per_class
calculating k-means++
make_matrix
do_kmeans
show_anhors
- validate_detector_recall
parse_network_cfg_custom
load_weights
fuse_conv_batchnorm
load_image
resize_image
basecfg
network_predict
get_network_boxes
do_nms_obj
replace_image_to_label
read_boxes
- validate_detector_map
read_data_cfg
option_find_str
get_labels_custom
read_map
remember_network_recurrent_state
free_network_recurrent_state
parse_network_cfg_custom
load_weights
fuse_conv_batchnorm
calculate_binary_weights
get_paths
list_to_array
// For multi-class precision and recall computation
load_data_in_thread
pthread_join
load_data_in_thread
basecfg
network_predict
get_network_boxes
do_nms_sort
diounms_sort
set_track_id
replace_image_to_label
read_boxes
SORT(detections)
// for PR-curve
// correct mAP calculation: ImageNet, PascalVOC
2010-2012
//add remaining area of PR curve when recall isn't 0
at rank-1
// free memory
restore_network_recurrent_state
return mean_average_precision;
- train_detector
read_data_cfg
option_find_str
cuda_set_device
parse_network_cfg_custom
get_labels_custom
basecfg
cuda_set_device
parse_network_cfg
get_current_iteration
draw_train_chart
load_data
rand_scale
// at the beginning (check if enough memory) and at
the end (calc rolling mean/variance)
pthread_join
load_data
resize_network
pthread_join
load_data
float_to_box
float_to_image
train_network_waitkey
train_networks
get_current_iteration
// calculate mAP for each 4 Epochs
resize_network
copy_weights_net
// combine Training and Validation networks
draw_train_loss
sync_nets
save_weights
// free memory
//free_network(net);
- test_detector
read_data_cfg
option_find_str
get_labels_custom
parse_network_cfg_custom
load_weights
fuse_conv_batchnorm(net);
calculate_binary_weights(net);
letterbox_image
resize_image
get_network_boxes
do_nms_sort
diounms_sort
draw_detections_v3
save_image
show_image
detection_to_json
replace_image_to_label
// free memory
- demo
parse_network_cfg_custom
load_weights
fuse_conv_batchnorm(net);
calculate_binary_weights(net);
get_capture_video_stream
get_capture_webcam
custom_create_thread
fetch_in_thread_sync
detect_in_thread_sync
create_window_cv
create_video_writer
get_time_point
custom_atomic_store_int
do_nms_obj
diounms_sort
set_track_id
send_json
send_http_post_request
draw_detections_cv_v3
max_val_cmp
send_mjpeg
write_frame_cv
this_thread_yield
release_video_writer
//free memory and thread
COCO依赖的文件:coco.data,yolov4.cfg,yolov4.weights
- duration_make_convolutional_layer:
336607
convolutional_layer make_convolutional_layer(int
batch, int steps, int h, int w, int c, int n, int groups, int size, int
stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int
batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index,
int antialiasing, convolutional_layer *share_layer, int assisted_excitation,
int deform, int train)
1) Preprocessing:Blur,antialiasing
2) cuda_make_array
3) get_convolutional_workspace_size
4) make_convolutional_layer
5) push_convolutional_layer,read_weights,
- duration_run_detector_demo:
339565723
1) read_data_cfg(datacfg);
2) option_find_int(options,
"classes", 20);
3) option_find_str(options,
"names", "data/names.list");
4) get_labels(name_list);
5) demo(cfg,
weights, thresh, hier_thresh, cam_index, filename, names, classes, avgframes,
frame_skip, prefix, out_filename,
mjpeg_port, dontdraw_bbox, json_port, dont_show, ext_output, letter_box,
time_limit_sec, http_post_host, benchmark, benchmark_layers);
- duration_main_run_detector:
339565785
run_detector(argc, argv);
1) find_arg,
find_int_arg, find_char_arg, find_float_arg
2) test_detector,
3) train_detector,
4) validate_detector,
validate_detector_recall, validate_detector_map
5) calc_anchors
6) draw_object
7) demo(read_data_cfg,
option_find_int, option_find_str, get_labels, free_list_contents_kvp,
free_list, free(gpus))
- duration_make_yolo_layer: 5110
make_yolo_layer
1) forward_yolo_layer;
activate_array,scal_add_cpu,entry_index,get_yolo_box,float_to_box_stride,compare_yolo_class,box_iou,delta_yolo_box,delta_yolo_box()_accumulate,averages_yolo_deltas,compute classification loss
2) backward_yolo_layer;
backward_yolo_layer_gpu;
axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
axpy_ongpu(l.batch*l.inputs, state.net.loss_scale * l.delta_normalizer, l.delta_gpu,
1, state.delta, 1);
3)cuda_make_array
4)cudaHostAlloc
5. duration_parse_network_cfg_custom:
2007
:parse_network_cfg_custom(char *filename, int batch,
int time_steps)
1) read_cfg(filename);
2) make_network(sections->size
- 1);
3) parse_net_options(options,
&net);
4) pre_allocate_pinned_memory((size_t)1024 * 1024 * 1024 * 8); // pre-allocate 8 GB CPU-RAM for pinned memory
5) parse_convolutional(options,
params);
parse_local(options, params);
parse_activation(options, params);
parse_rnn(options, params);
parse_gru(options, params);
parse_lstm(options, params);
parse_conv_lstm(options, params);
parse_history(options, params);
parse_crnn(options, params);
parse_connected(options,
params);
parse_crop(options, params);
parse_cost(options, params);
parse_region(options, params);
parse_yolo(options, params);
parse_gaussian_yolo(options, params);
parse_detection(options, params);
parse_softmax(options, params);
parse_contrastive(options, params);
parse_normalization(options, params);
parse_batchnorm(options, params);
parse_maxpool(options, params);
parse_local_avgpool(options, params);
parse_reorg(options, params);
parse_reorg_old(options, params);
parse_avgpool(options, params);
parse_route(options, params);
parse_upsample(options, params, net);
parse_shortcut(options, params, net);
parse_scale_channels(options, params, net);
parse_sam(options, params, net);
parse_dropout(options, params);
cuda_make_array_pinned
cuda_make_array_pinned_preallocated
set_specified_workspace_limit
cuda_make_array:cuda_pull_array_async,activate_array_ongpu
get_network_output
CHECK_CUDA
Coco依赖的文件:coco.data,yolov4.cfg,yolov4.conv.137,trainvalno5k.txt,train2014
read_data_cfg
option_find_str
open_valid_file
cuda_set_device
parse_network_cfg_custom
get_labels_custom
basecfg
parse_network_cfg
get_paths(train_images)
list_to_array(plist)
get_current_iteration(net)
draw_train_chart
load_data
rand_scale(rand_coef);
pthread_join
float_to_box
float_to_image
compute_loss
train_network_waitkey
train_networks
free_data
resize_network
validate_detector_map
save_weights
draw_train_loss
sync_nets
release_mat(&img);
destroy_all_windows_cv();
// free memory
pthread_join(load_thread, 0);
free_data(buffer);
free_load_threads(&args);
free(base);
free(paths);
free_list_contents(plist);
free_list(plist);
free_list_contents_kvp(options);
free_list(options);
free_network;
free(nets);
free_network(net_map);
Makefile
GPU=0
CUDNN=0
CUDNN_HALF=0
OPENCV=0
AVX=0
OPENMP=0
LIBSO=0
ZED_CAMERA=0
ZED_CAMERA_v2_8=0
# set GPU=1 and CUDNN=1 to speedup on GPU
# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision on Tensor Cores) GPU: Volta, Xavier, Turing and higher
# set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0)
# set ZED_CAMERA=1 to enable ZED SDK 3.0 and above
# set ZED_CAMERA_v2_8=1 to enable ZED SDK 2.X
USE_CPP=0
DEBUG=0
ARCH= -gencode arch=compute_30,code=sm_30 \
-gencode
arch=compute_35,code=sm_35 \
-gencode
arch=compute_50,code=[sm_50,compute_50] \
-gencode
arch=compute_52,code=[sm_52,compute_52] \
-gencode
arch=compute_61,code=[sm_61,compute_61]
OS := $(shell uname)
# Tesla A100 (GA100), DGX-A100, RTX 3080
# ARCH= -gencode
arch=compute_80,code=[sm_80,compute_80]
# Tesla V100
# ARCH= -gencode
arch=compute_70,code=[sm_70,compute_70]
# GeForce RTX 2080 Ti, RTX 2080, RTX 2070, Quadro RTX
8000, Quadro RTX 6000, Quadro RTX 5000, Tesla T4, XNOR Tensor Cores
# ARCH= -gencode
arch=compute_75,code=[sm_75,compute_75]
# Jetson XAVIER
# ARCH= -gencode
arch=compute_72,code=[sm_72,compute_72]
# GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030,
Titan Xp, Tesla P40, Tesla P4
# ARCH= -gencode arch=compute_61,code=sm_61 -gencode
arch=compute_61,code=compute_61
# GP100/Tesla P100 - DGX-1
# ARCH= -gencode arch=compute_60,code=sm_60
# For Jetson TX1, Tegra X1, DRIVE CX, DRIVE PX -
uncomment:
# ARCH= -gencode
arch=compute_53,code=[sm_53,compute_53]
# For Jetson Tx2 or Drive-PX2 uncomment:
# ARCH= -gencode
arch=compute_62,code=[sm_62,compute_62]
VPATH=./src/
EXEC=darknet
OBJDIR=./obj/
ifeq ($(LIBSO), 1)
LIBNAMESO=libdarknet.so
APPNAMESO=uselib
endif
ifeq ($(USE_CPP), 1)
CC=g++
else
CC=gcc
endif
CPP=g++ -std=c++11
NVCC=nvcc
OPTS=-Ofast
LDFLAGS= -lm -pthread
COMMON= -Iinclude/ -I3rdparty/stb/include
CFLAGS=-Wall -Wfatal-errors -Wno-unused-result
-Wno-unknown-pragmas -fPIC
ifeq ($(DEBUG), 1)
#OPTS= -O0 -g
#OPTS= -Og -g
COMMON+= -DDEBUG
CFLAGS+= -DDEBUG
else
ifeq ($(AVX), 1)
CFLAGS+= -ffp-contract=fast -mavx -mavx2 -msse3
-msse4.1 -msse4.2 -msse4a
endif
endif
CFLAGS+=$(OPTS)
ifneq (,$(findstring MSYS_NT,$(OS)))
LDFLAGS+=-lws2_32
endif
ifeq ($(OPENCV), 1)
COMMON+= -DOPENCV
CFLAGS+= -DOPENCV
LDFLAGS+= `pkg-config --libs opencv4 2> /dev/null
|| pkg-config --libs opencv`
COMMON+= `pkg-config --cflags opencv4 2> /dev/null
|| pkg-config --cflags opencv`
endif
ifeq ($(OPENMP), 1)
ifeq
($(OS),Darwin) #MAC
CFLAGS+= -Xpreprocessor -fopenmp
else
CFLAGS+=
-fopenmp
endif
LDFLAGS+= -lgomp
endif
ifeq ($(GPU), 1)
COMMON+= -DGPU -I/usr/local/cuda/include/
CFLAGS+= -DGPU
ifeq ($(OS),Darwin) #MAC
LDFLAGS+= -L/usr/local/cuda/lib -lcuda -lcudart
-lcublas -lcurand
else
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart
-lcublas -lcurand
endif
endif
ifeq ($(CUDNN), 1)
COMMON+= -DCUDNN
ifeq ($(OS),Darwin) #MAC
CFLAGS+= -DCUDNN -I/usr/local/cuda/include
LDFLAGS+= -L/usr/local/cuda/lib -lcudnn
else
CFLAGS+= -DCUDNN -I/usr/local/cudnn/include
LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn
endif
endif
ifeq ($(CUDNN_HALF), 1)
COMMON+= -DCUDNN_HALF
CFLAGS+= -DCUDNN_HALF
ARCH+= -gencode
arch=compute_70,code=[sm_70,compute_70]
endif
ifeq ($(ZED_CAMERA), 1)
CFLAGS+= -DZED_STEREO -I/usr/local/zed/include
ifeq ($(ZED_CAMERA_v2_8), 1)
LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input
-lsl_zed
#-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0
else
LDFLAGS+= -L/usr/local/zed/lib -lsl_zed
#-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0
endif
endif
OBJ=image_opencv.o http_stream.o gemm.o utils.o
dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o
col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o
data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o
darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o
normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o
compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o
rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o
batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o
voxel.o tree.o yolo_layer.o gaussian_yolo_layer.o upsample_layer.o lstm_layer.o
conv_lstm_layer.o scale_channels_layer.o sam_layer.o
ifeq ($(GPU), 1)
LDFLAGS+= -lstdc++
OBJ+=convolutional_kernels.o activation_kernels.o
im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o
dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o
avgpool_layer_kernels.o
endif
OBJS = $(addprefix $(OBJDIR), $(OBJ))
DEPS = $(wildcard src/*.h) Makefile include/darknet.h
all: $(OBJDIR) backup results setchmod $(EXEC)
$(LIBNAMESO) $(APPNAMESO)
ifeq ($(LIBSO), 1)
CFLAGS+= -fPIC
$(LIBNAMESO): $(OBJDIR) $(OBJS) include/yolo_v2_class.hpp
src/yolo_v2_class.cpp
$(CPP)
-shared -std=c++11 -fvisibility=hidden -DLIB_EXPORTS $(COMMON) $(CFLAGS)
$(OBJS) src/yolo_v2_class.cpp -o $@ $(LDFLAGS)
$(APPNAMESO): $(LIBNAMESO) include/yolo_v2_class.hpp
src/yolo_console_dll.cpp
$(CPP)
-std=c++11 $(COMMON) $(CFLAGS) -o $@ src/yolo_console_dll.cpp $(LDFLAGS) -L ./
-l:$(LIBNAMESO)
endif
$(EXEC): $(OBJS)
$(CPP)
-std=c++11 $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)
$(OBJDIR)%.o: %.c $(DEPS)
$(CC)
$(COMMON) $(CFLAGS) -c $< -o $@
$(OBJDIR)%.o: %.cpp $(DEPS)
$(CPP)
-std=c++11 $(COMMON) $(CFLAGS) -c $< -o $@
$(OBJDIR)%.o: %.cu $(DEPS)
$(NVCC)
$(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@
$(OBJDIR):
mkdir -p
$(OBJDIR)
backup:
mkdir -p
backup
results:
mkdir -p results
setchmod:
chmod +x *.sh
.PHONY: clean
clean:
rm -rf
$(OBJS) $(EXEC) $(LIBNAMESO) $(APPNAMESO)
SPP结构
Yolov4性能分析(下)的更多相关文章
- Yolov4性能分析(上)
Yolov4性能分析(上) 一.目录 实验测试 1) 测试介绍 2) Test 3) Train 二. 分析 1.实验测试 1. 1 实验测试方法 Yolov4训练train实验方法(Darkn ...
- Gson全解析(下)-Gson性能分析
前言 在之前的学习中,我们在Gson全解析(上)Gson使用的基础到分别运用了JsonSerializer和JsonDeserializer进行JSON和java实体类之间的相互转化. 在Gson全解 ...
- Centos下给PHP7添加Xhprof性能分析
什么是 Xhprof?XHProf是facebook 开发的一个测试php性能的扩展,本文记录了在PHP应用中使用XHProf对PHP进行性能优化,查找性能瓶颈的方法. 它报告函数级别的请求次数和各种 ...
- Linux下性能分析工具汇总
来自:http://os.51cto.com/art/201104/253114.htm 本文讲述的是:CPU性能分析工具.Memory性能分析工具.I/O性能分析工具.Network性能分析工具. ...
- linux下常见的性能分析工具
转载于:http://bian5399.blog.51cto.com/3848702/834715 性能调优的主要目的是使系统能够有效的利用各种资源,最大的发挥应用程序和系统之间的性能融合,使应用高效 ...
- YOLOV4知识点分析(二)
YOLOV4知识点分析(二) 6. 数据增强相关-mixup 论文名称:mixup: BEYOND EMPIRICAL RISK MINIMIZATION 论文地址:https://arxiv.org ...
- YOLOV4知识点分析(一)
YOLOV4知识点分析(一) 简 介 yolov4论文:YOLOv4: Optimal Speed and Accuracy of Object Detection arxiv:https://arx ...
- SQL Server-聚焦IN VS EXISTS VS JOIN性能分析(十九)
前言 本节我们开始讲讲这一系列性能比较的终极篇IN VS EXISTS VS JOIN的性能分析,前面系列有人一直在说场景不够,这里我们结合查询索引列.非索引列.查询小表.查询大表来综合分析,简短的内 ...
- SQL Server-聚焦NOT IN VS NOT EXISTS VS LEFT JOIN...IS NULL性能分析(十八)
前言 本节我们来综合比较NOT IN VS NOT EXISTS VS LEFT JOIN...IS NULL的性能,简短的内容,深入的理解,Always to review the basics. ...
随机推荐
- hdu3449 有依赖的背包问题
题意: 给你一些物品,每个物品有自己的价值和花费,每个物品都对应一个箱子,每个箱子有价钱,买这个物品必须买相应的箱子,给你一个价钱,问最多可以获得多少价值 <提示:多个物品可能同时对 ...
- POJ2060最小路径覆盖
题意: 有n个任务,如果时间来得及干完某些任务后还可以接着干别的任务,给一个任务清单,问最少派出去多少人能完成所有任务. 思路: 比较简单的最小路径覆盖问题了,在DAG中找到 ...
- 多线程-5.JMM之happens-before原则
a happens-before b 翻译为a操作对b操作是可见的.可见即是指共享变量的更改能获知. 特性:传递性 原则:volatile定义的变量 写操作 happens-before 读操作 同一 ...
- Canal详细入门实战(使用总结)
Canal介绍 Canal简介 canal [kə'næl],译意为水道/管道/沟渠,主要用途是基于 MySQL 数据库增量日志解析,提供增量数据订阅和消费 早期阿里巴巴因为杭州和美国双机房部署,存在 ...
- [并发编程 - 多线程:信号量、死锁与递归锁、时间Event、定时器Timer、线程队列、GIL锁]
[并发编程 - 多线程:信号量.死锁与递归锁.时间Event.定时器Timer.线程队列.GIL锁] 信号量 信号量Semaphore:管理一个内置的计数器 每当调用acquire()时内置计数器-1 ...
- 虚拟机快速下载安装配置aarch64-linux-gnu-gcc工具链
方式一:软件仓库安装 此方法不用自己去配置交叉编译工具链 1.查看本地仓库有支持哪些版本哪些 输入命令: apt-cache search aarch64 2.下载安装 gcc-8-aarch64-l ...
- Linux性能监控与分析之--- CPU
Linux性能监控与分析之--- CPU 望月成三人关注 2016.07.25 18:16:12字数 1,576阅读 2,837 CPU性能指标 用户进程使用CPU的比率 系统进程使用CPU的比率 W ...
- 如何在我的EC2实例状态更改时获取自定义电子邮件通知
具体详情,请参见: https://amazonaws-china.com/cn/premiumsupport/knowledge-center/ec2-email-instance-state-ch ...
- nginx重定向rewrite
引入rewrite vim /etc/nginx/conf.d/mobile_pc.conf server{ listen 80; server_name www.zls.com zls.com; r ...
- 010.kubernets的调度系统之daemonset
daemonset简单操作使用 Deployment 是 Kubernetes 中用于处理无状态服务的资源,而 StatefulSet 是用于支持有状态服务的资源,这两种不同的资源从状态的角度对服务进 ...