OpenCL 图像卷积 3 使用 CPU
▶ CPU 图像卷积,共四种方法。分别为基本串行,使用模板,使用局部内存,使用AVX指令优化
● 全部的代码,仅在主函数中选择调用的函数名即可。
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <vector>

#include <immintrin.h>
#include <opencv2/opencv.hpp>

const char *inputFile = "R:/1.png";
const char *outputFile = "R:/2.png";

// Approximate float comparison; returns true when a and b are considered equal.
//   b == 0 : absolute test, |a| < 0.001 (also avoids dividing by zero)
//   b != 0 : relative test, |a / b - 1| < 0.001
bool floatEq(const float a, const float b)
{
    if (b == 0)
        return fabs(a) < 0.001;
    return fabs(a / b - 1) < 0.001;
}
// Basic 2D convolution (strictly a correlation: the filter is not flipped).
//
//   input       row-major image of inputRow x inputCol floats
//   output      row-major buffer of the same size, fully overwritten
//   filter      row-major filterWidth x filterWidth weights
//   filterWidth side length of the filter; must be odd, otherwise the
//               [-half, +half] tap range indexes past the end of filter
//
// Every pixel closer than filterWidth / 2 to an image edge is set to 0;
// every other pixel receives the weighted sum of its neighbourhood.
//
// Border handling: the border strips are zeroed by hand. Equivalent
// alternatives are a memset of the whole buffer up front, or a single
// full-range loop with the border test in the innermost position.
void convolution01(const float *input, float *output, const int inputRow, const int inputCol,
    const float *filter, const int filterWidth)
{
    const int halfFilterWidth = filterWidth / 2;
    const int rowEnd = inputRow - halfFilterWidth; // one past the last interior row
    const int colEnd = inputCol - halfFilterWidth; // one past the last interior column

    if (inputRow <= 0 || inputCol <= 0)
        return;

    // No interior at all (image not larger than the border on some axis):
    // the mirrored border writes below would index negative rows/columns,
    // so just clear the whole buffer.
    if (rowEnd <= halfFilterWidth || colEnd <= halfFilterWidth)
    {
        memset(output, 0, sizeof(float) * inputRow * inputCol);
        return;
    }

    // Top and bottom border strips (row and its mirror image).
#pragma omp parallel for num_threads(8)
    for (int row = 0; row < halfFilterWidth; row++)
    {
        for (int col = 0; col < inputCol; col++)
            output[row * inputCol + col] = output[(inputRow - 1 - row) * inputCol + col] = 0.0f;
    }

    // Left and right border strips of the remaining rows.
#pragma omp parallel for num_threads(8)
    for (int row = halfFilterWidth; row < rowEnd; row++)
    {
        for (int col = 0; col < halfFilterWidth; col++)
            output[row * inputCol + col] = output[row * inputCol + inputCol - 1 - col] = 0.0f;
    }

    // Interior. All loop variables are loop-local, hence private per thread.
#pragma omp parallel for num_threads(8)
    for (int row = halfFilterWidth; row < rowEnd; row++)
    {
        for (int col = halfFilterWidth; col < colEnd; col++)
        {
            float sum = 0.0f;
            for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
            {
                for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                    sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr) * inputCol + col + cc];
            }
            output[row * inputCol + col] = sum;
        }
    }
}
// Same result as the basic convolution, but the filter width is a template
// parameter, so the tap loops have compile-time trip counts.
//
//   filterWidth (template) side length of the filter; must be odd
//   input       row-major image of inputRow x inputCol floats
//   output      row-major buffer of the same size, fully overwritten
//   filter      row-major filterWidth x filterWidth weights
//
// The whole output is cleared first, so pixels closer than filterWidth / 2
// to an edge stay 0; only interior pixels are computed.
template<int filterWidth>
void convolution02(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)
{
    constexpr int halfFilterWidth = filterWidth / 2;
    const int rowEnd = inputRow - halfFilterWidth;
    const int colEnd = inputCol - halfFilterWidth;

    memset(output, 0, sizeof(float) * inputRow * inputCol);

    // Loop variables are loop-local, hence private to each OpenMP thread.
#pragma omp parallel for num_threads(8)
    for (int row = halfFilterWidth; row < rowEnd; row++)
    {
        for (int col = halfFilterWidth; col < colEnd; col++)
        {
            float sum = 0.0f;
            for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
            {
                for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                    sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr) * inputCol + col + cc];
            }
            output[row * inputCol + col] = sum;
        }
    }
}
// Tiled convolution: the interior is processed in blockRow x blockCol tiles,
// each accumulated in a small local array, with the filter tap loop outermost
// so one filter weight is applied to the whole tile at a time.
//
//   filterWidth (template) side length of the filter; must be odd
//   blockRow, blockCol (template) tile height and width in pixels
//   input       row-major image of inputRow x inputCol floats
//   output      row-major buffer of the same size
//   filter      row-major filterWidth x filterWidth weights
//
// The image size must be a multiple of the tile size; otherwise a message is
// printed and the function returns without touching output.
//
// Tiles start at the first interior pixel, so the last tile of each row and
// column overhangs the interior. Tiles are clamped to the interior, which
// keeps the border at 0 (same result as the non-tiled versions) and keeps
// every read inside the image.
template<int filterWidth, int blockRow, int blockCol>
void convolution03(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)
{
    constexpr int halfFilterWidth = filterWidth / 2;
    const int rowEnd = inputRow - halfFilterWidth; // one past the last interior row
    const int colEnd = inputCol - halfFilterWidth; // one past the last interior column

    if (inputRow % blockRow || inputCol % blockCol)
    {
        printf("Failed, outputRow %% blockRow || outputCol %% blockCol\n");
        return;
    }

    memset(output, 0, sizeof(float) * inputRow * inputCol);

    // Everything below is declared inside the parallel loop and is therefore
    // private to each thread. Sharing function-scope loop counters between
    // threads is a data race.
#pragma omp parallel for num_threads(8)
    for (int row = halfFilterWidth; row < rowEnd; row += blockRow)
    {
        const int tileRows = (row + blockRow <= rowEnd) ? blockRow : rowEnd - row;
        for (int col = halfFilterWidth; col < colEnd; col += blockCol)
        {
            const int tileCols = (col + blockCol <= colEnd) ? blockCol : colEnd - col;
            float sum[blockRow * blockCol] = { 0.0f };

            for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
            {
                for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                {
                    const float filterElement = filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth];
                    for (int i = 0; i < tileRows; i++)
                    {
                        for (int j = 0; j < tileCols; j++)
                            sum[i * blockCol + j] += filterElement * input[(row + rr + i) * inputCol + col + cc + j];
                    }
                }
            }

            for (int i = 0; i < tileRows; i++)
            {
                for (int j = 0; j < tileCols; j++)
                    output[(row + i) * inputCol + col + j] = sum[i * blockCol + j];
            }
        }
    }
}
// Tiled convolution using AVX/FMA: like the scalar tiled version, but each
// accumulator is a __m256 holding 8 horizontally adjacent pixels, so a tile
// covers blockRow rows by blockCol * 8 columns.
//
//   filterWidth (template) side length of the filter; must be odd
//   blockRow    (template) tile height in pixels
//   blockCol    (template) tile width in 8-float vectors
//   input       row-major image of inputRow x inputCol floats
//   output      row-major buffer of the same size
//   filter      row-major filterWidth x filterWidth weights
//
// The image size must be a multiple of the tile size; otherwise a message is
// printed and the function returns without touching output.
//
// Tiles start at the first interior pixel, so the last tile of each row and
// column overhangs the interior. Unclamped, its vector loads and stores run
// past the end of the row, and on the last rows past the end of the buffers.
// Tiles are therefore clamped to the interior: vectors are used only where
// all 8 lanes are interior pixels, and the remaining (< 8 per vector slot)
// columns of the last tile are finished with scalar code. Vector lanes use a
// fused multiply-add, so they may differ from the scalar lanes in the last
// bit of rounding.
//
// Requires a CPU with AVX2 and FMA. On GCC/Clang the target attribute lets the
// intrinsics compile even when the translation unit is not built with
// -mavx2 -mfma (MSVC accepts the intrinsics unconditionally).
template<int filterWidth, int blockRow, int blockCol>
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2,fma")))
#endif
void convolution04(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)
{
    constexpr int halfFilterWidth = filterWidth / 2;
    constexpr int lanes = 8;                       // floats per __m256
    const int rowEnd = inputRow - halfFilterWidth; // one past the last interior row
    const int colEnd = inputCol - halfFilterWidth; // one past the last interior column

    if (inputRow % blockRow || inputCol % (blockCol * 8))
    {
        printf("Failed, inputRow %% blockRow || inputCol %% blockCol\n");
        return;
    }

    memset(output, 0, sizeof(float) * inputRow * inputCol);

    // Everything below is declared inside the parallel loop and is therefore
    // private to each thread. Sharing function-scope loop counters between
    // threads is a data race.
#pragma omp parallel for num_threads(8)
    for (int row = halfFilterWidth; row < rowEnd; row += blockRow)
    {
        const int tileRows = (row + blockRow <= rowEnd) ? blockRow : rowEnd - row;
        for (int col = halfFilterWidth; col < colEnd; col += blockCol * lanes)
        {
            // Number of vector slots whose 8 lanes all lie inside the interior.
            const int fitting = (colEnd - col) / lanes;
            const int fullVecs = (fitting < blockCol) ? fitting : blockCol;
            // Leftover interior columns of this tile, handled without vectors.
            const int tailBegin = col + fullVecs * lanes;
            const int tailEnd = (col + blockCol * lanes <= colEnd) ? col + blockCol * lanes : colEnd;

            __m256 sum[blockRow * blockCol];
            for (int s = 0; s < blockRow * blockCol; s++)
                sum[s] = _mm256_setzero_ps();

            for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
            {
                for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                {
                    const __m256 filterElement = _mm256_broadcast_ss(filter + (rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth);
                    for (int i = 0; i < tileRows; i++)
                    {
                        for (int j = 0; j < fullVecs; j++)
                        {
                            const __m256 imageElement = _mm256_loadu_ps(input + (row + rr + i) * inputCol + col + cc + j * lanes);
                            sum[i * blockCol + j] = _mm256_fmadd_ps(filterElement, imageElement, sum[i * blockCol + j]);
                        }
                    }
                }
            }

            for (int i = 0; i < tileRows; i++)
            {
                for (int j = 0; j < fullVecs; j++)
                    _mm256_storeu_ps(output + (row + i) * inputCol + col + j * lanes, sum[i * blockCol + j]);
            }

            // Scalar tail for the columns that do not fill a whole vector.
            for (int i = 0; i < tileRows; i++)
            {
                for (int c = tailBegin; c < tailEnd; c++)
                {
                    float acc = 0.0f;
                    for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
                    {
                        for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                            acc += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + i + rr) * inputCol + c + cc];
                    }
                    output[(row + i) * inputCol + c] = acc;
                }
            }
        }
    }
}
} int main()
int i, k;
clock_t time;
float filterSum; // 卷积窗口相关
const int filterWidth = , filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / ;
float filter[filterSize] =
{// 模糊窗口
, , , , ,
, .f / , .f / , .f / , ,
, .f / , .f / , .f / , ,
, .f / , .f / , .f / , ,
, , , ,
for (filterSum = 0.0f, i = ; i < filterSize; filterSum += filter[i++]);
if (!floatEq(filterSum, ))// 非归零的卷积窗口(如模糊)需要归一化
for (i = ; i < filterSize; filter[i] /= filterSum, i++); // 图片相关
cv::Mat input = cv::imread(inputFile), output = input, channel[];
cv::split(input, channel);
const int inputRow = input.rows, inputCol = input.cols, inputDataSize = inputRow * inputCol;
float *inputData = (float*)malloc(sizeof(float) * inputDataSize);
float *outputData = (float*)malloc(sizeof(float) * inputDataSize); for (k = ; k < ; k++)// 三个通道,分别为蓝、绿、红
for (i = ; i < inputRow * inputCol; inputData[i] = (float)channel[k].data[i], i++);
time = clock();
convolution01(inputData, outputData, inputRow, inputCol, (const float *)filter, filterWidth);
//convolution02<filterWidth>(inputData, outputData, inputRow, inputCol, filter);
//convolution03<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter);
//convolution04<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter);
time = clock() - time;
printf("Time for channel[%d]:%d ms\n", k, time);
for (i = ; i < inputRow * inputCol; channel[k].data[i] = (unsigned char)outputData[i], i++);
} cv::merge(channel, , output);
cv::imwrite(outputFile, output);
//imshow("merge", output);
return ;
● 输出结果,使用一张 4608 × 6656 的图片(bmp87.7MB)进行测试,使用主函数中那个边长为5、实际窗口长度为 3 的均值窗口。图片太大喘不上来,偷梁换柱成小图看效果
■ 计时结果
// convolution01,memset + 内部计算,无 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished. // convolution01,手动除边 + 内部计算,无 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished. // convolution01,循环内判断,无 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished. // convolution01,手动除边 + 内部计算,有 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished. // convolution02,有 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished. // convolution03<filterWidth, 4, 4>,无 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished. // convolution04<filterWidth, 4, 4>,无 OpenMP
Time for channel[]: ms
Time for channel[]: ms
Time for channel[]: ms Finished.
■ 没法给 convolution03 和 convolution04 加 OpenMP,一加就各种内存冲突,边界判断都挡不住。
