OpenCL 图像卷积 3 使用 CPU

▶ CPU 图像卷积，共四种方法。分别为基本串行，使用模板，使用局部内存，使用AVX指令优化

● 全部的代码，仅在主函数中选择调用的函数名即可。

 #include <stdio.h>

 #include <stdlib.h>

 #include <time.h>

 #include <opencv2/opencv.hpp>

 // Path of the image that main() loads with cv::imread.
 const char *inputFile = "R:/1.png";

 // Path that main() writes the filtered image to with cv::imwrite.
 const char *outputFile = "R:/2.png";

 // Approximate float equality.
 //   a, b : values to compare.
 // Returns true when the two values are considered equal:
 //   - if b is exactly zero, a must have magnitude below 0.001 (absolute test);
 //   - otherwise the ratio a / b must be within 0.001 of 1 (relative test).
 bool floatEq(const float a, const float b)
 {
     if (b == 0.0f)                          // a / b would be undefined, fall back to an absolute tolerance
         return fabs(a) < 0.001;
     return fabs(a / b - 1.0f) < 0.001;
 }

 // Baseline 2-D filtering of a single-channel float image.
 //   input       : inputRow * inputCol floats, row-major.
 //   output      : buffer of the same size; every element is written.
 //   inputRow    : number of image rows.
 //   inputCol    : number of image columns.
 //   filter      : filterWidth * filterWidth weights, row-major.
 //   filterWidth : side length of the filter. It is expected to be odd: the kernel
 //                 loops run from -filterWidth/2 to +filterWidth/2 inclusive, which
 //                 for an even width would index one row/column past the filter.
 // Pixels closer than filterWidth/2 to any edge are set to 0; every other pixel
 // receives the weighted sum of its neighbourhood.
 void convolution01(const float *input, float *output, const int inputRow, const int inputCol,
     const float *filter, const int filterWidth)
 {
     const int halfFilterWidth = filterWidth / 2;
     // Border thickness clamped to the image size, so the mirrored indices below
     // (inputRow - 1 - row, inputCol - 1 - col) can never become negative.
     const int borderRows = halfFilterWidth < inputRow ? halfFilterWidth : inputRow;
     const int borderCols = halfFilterWidth < inputCol ? halfFilterWidth : inputCol;

     // Alternative to the two border loops: clear the whole output up front.
     //memset(output, 0, sizeof(float) * inputRow * inputCol);

     // All loop variables are declared inside the loops, so every OpenMP thread
     // owns its own copy and no data-sharing clauses are required.

     // Zero the top and bottom border rows.
 #pragma omp parallel for num_threads(8)
     for (int row = 0; row < borderRows; row++)
     {
         for (int col = 0; col < inputCol; col++)
             output[row*inputCol + col] = output[(inputRow - 1 - row)*inputCol + col] = 0;
     }

     // Zero the left and right border columns of the remaining rows.
 #pragma omp parallel for num_threads(8)
     for (int row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)
     {
         for (int col = 0; col < borderCols; col++)
             output[row*inputCol + col] = output[row*inputCol + inputCol - 1 - col] = 0;
     }

     // Interior pixels: accumulate filter weight * neighbouring pixel.
 #pragma omp parallel for num_threads(8)
     for (int row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)
     {
         for (int col = halfFilterWidth; col < inputCol - halfFilterWidth; col++)
         {
             float sum = 0.0f;
             for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
             {
                 for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                     sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc];
             }
             output[row * inputCol + col] = sum;
         }
     }

     /*
     // Alternative kept for benchmarking: loop over the full image and test for
     // the border in the innermost position.
     for (int row = 0; row < inputRow; row++)
     {
         for (int col = 0; col < inputCol; col++)
         {
             if (row < halfFilterWidth || row >= inputRow - halfFilterWidth || col < halfFilterWidth || col >= inputCol - halfFilterWidth)
             {
                 output[row * inputCol + col] = 0;
                 continue;
             }
             float sum = 0.0f;
             for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
             {
                 for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                     sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc];
             }
             output[row * inputCol + col] = sum;
         }
     }
     */
 }

 // Same filtering as the baseline version, but with the filter width supplied
 // as a template argument so it is a compile-time constant.
 //   filterWidth : side length of the filter (template argument). Expected to be
 //                 odd: the kernel loops cover -filterWidth/2 .. +filterWidth/2
 //                 inclusive, which for an even width would index past the filter.
 //   input       : inputRow * inputCol floats, row-major.
 //   output      : buffer of the same size; cleared to 0, then interior pixels are written.
 //   filter      : filterWidth * filterWidth weights, row-major.
 // Pixels closer than filterWidth/2 to any edge stay 0.
 template<int filterWidth>
 void convolution02(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)
 {
     const int halfFilterWidth = filterWidth / 2;

     // Clearing everything first takes care of the border pixels.
     memset(output, 0, sizeof(float) * inputRow * inputCol);

     // Loop variables live inside the loops, so each OpenMP thread has private copies.
 #pragma omp parallel for num_threads(8)
     for (int row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)
     {
         for (int col = halfFilterWidth; col < inputCol - halfFilterWidth; col++)
         {
             float sum = 0.0f;
             for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
             {
                 for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                     sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc];
             }
             output[row * inputCol + col] = sum;
         }
     }
 }

 // Tiled filtering: the interior of the image is processed in tiles of
 // blockRow x blockCol pixels whose partial sums are kept in a small local array.
 //   filterWidth        : side length of the filter (template argument, expected odd).
 //   blockRow, blockCol : tile size in pixels (template arguments).
 //   input              : inputRow * inputCol floats, row-major.
 //   output             : buffer of the same size; cleared to 0, then interior pixels are written.
 //   filter             : filterWidth * filterWidth weights, row-major.
 // If inputRow is not a multiple of blockRow or inputCol is not a multiple of
 // blockCol, a message is printed and the function returns without touching output.
 // Pixels closer than filterWidth/2 to any edge stay 0.
 template<int filterWidth, int blockRow, int blockCol>
 void convolution03(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)
 {
     const int halfFilterWidth = filterWidth / 2;
     const int rowEnd = inputRow - halfFilterWidth;      // one past the last interior row
     const int colEnd = inputCol - halfFilterWidth;      // one past the last interior column

     if (inputRow % blockRow || inputCol % blockCol)     // image sides must be multiples of the tile sides
     {
         printf("Failed, outputRow %% blockRow || outputCol %% blockCol\n");
         return;
     }
     memset(output, 0, sizeof(float) * inputRow * inputCol);

     // Every variable used inside the parallel loop is declared inside it, so each
     // thread works on private copies; tiles write disjoint parts of output.
 #pragma omp parallel for num_threads(8)
     for (int row = halfFilterWidth; row < rowEnd; row += blockRow)
     {
         // Tiles start at halfFilterWidth, so the last one can overhang the interior:
         // clip it so border pixels keep the 0 written by memset and no read leaves the image.
         const int tileRows = (rowEnd - row < blockRow) ? rowEnd - row : blockRow;
         for (int col = halfFilterWidth; col < colEnd; col += blockCol)
         {
             const int tileCols = (colEnd - col < blockCol) ? colEnd - col : blockCol;
             float sum[blockRow * blockCol] = { 0.0f };

             // Apply one filter weight at a time to the whole tile.
             for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
             {
                 for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                 {
                     const float filterElement = filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth];
                     for (int i = 0; i < tileRows; i++)
                     {
                         for (int j = 0; j < tileCols; j++)
                             sum[i * blockCol + j] += filterElement * input[(row + rr + i) * inputCol + col + cc + j];
                     }
                 }
             }

             // Write the finished tile back.
             for (int i = 0; i < tileRows; i++)
             {
                 for (int j = 0; j < tileCols; j++)
                     output[(row + i) * inputCol + col + j] = sum[i * blockCol + j];
             }
         }
     }
 }

 // Tiled filtering with AVX/FMA: each tile is blockRow rows by blockCol vectors
 // of 8 floats, accumulated with fused multiply-add.
 //   filterWidth        : side length of the filter (template argument, expected odd).
 //   blockRow, blockCol : tile size in rows and in 8-float vectors (template arguments).
 //   input              : inputRow * inputCol floats, row-major (unaligned loads are used).
 //   output             : buffer of the same size; cleared to 0, then interior pixels are written.
 //   filter             : filterWidth * filterWidth weights, row-major.
 // If inputRow is not a multiple of blockRow or inputCol is not a multiple of
 // blockCol * 8, a message is printed and the function returns without touching output.
 // Pixels closer than filterWidth/2 to any edge stay 0.
 // The CPU must support AVX and FMA. On GCC/Clang builds without -mavx -mfma the
 // attribute below enables those instruction sets for this function only.
 template<int filterWidth, int blockRow, int blockCol>
 #if defined(__GNUC__) && !(defined(__AVX__) && defined(__FMA__))
 __attribute__((target("avx,fma")))
 #endif
 void convolution04(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)
 {
     const int lanes = 8;                                // floats held by one __m256
     const int halfFilterWidth = filterWidth / 2;
     const int rowEnd = inputRow - halfFilterWidth;      // one past the last interior row
     const int colEnd = inputCol - halfFilterWidth;      // one past the last interior column

     if (inputRow % blockRow || inputCol % (blockCol * lanes))
     {
         printf("Failed, inputRow %% blockRow || inputCol %% blockCol\n");
         return;
     }
     memset(output, 0, sizeof(float) * inputRow * inputCol);

     // Every variable used inside the parallel loop is declared inside it, so each
     // thread works on private copies; tiles write disjoint parts of output.
 #pragma omp parallel for num_threads(8)
     for (int row = halfFilterWidth; row < rowEnd; row += blockRow)
     {
         // Tiles start at halfFilterWidth, so the last one can overhang the interior: clip it.
         const int tileRows = (rowEnd - row < blockRow) ? rowEnd - row : blockRow;
         for (int col = halfFilterWidth; col < colEnd; col += blockCol * lanes)
         {
             const int tileColEnd = (colEnd - col < blockCol * lanes) ? colEnd : col + blockCol * lanes;
             // Only vectors lying completely inside the interior are handled with AVX;
             // this keeps every 8-float load and store inside the current image row.
             const int tileVecs = (tileColEnd - col) / lanes;

             __m256 sum[blockRow * blockCol];
             for (int s = 0; s < blockRow * blockCol; s++)
                 sum[s] = _mm256_setzero_ps();

             // Apply one broadcast filter weight at a time to the whole tile.
             for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
             {
                 for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                 {
                     const __m256 filterElement = _mm256_broadcast_ss(filter + (rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth);
                     for (int i = 0; i < tileRows; i++)
                     {
                         for (int j = 0; j < tileVecs; j++)
                         {
                             const __m256 imageElement = _mm256_loadu_ps(input + (row + rr + i)*inputCol + col + cc + j * lanes);
                             sum[i * blockCol + j] = _mm256_fmadd_ps(filterElement, imageElement, sum[i * blockCol + j]);
                         }
                     }
                 }
             }

             // Write the finished vectors back.
             for (int i = 0; i < tileRows; i++)
             {
                 for (int j = 0; j < tileVecs; j++)
                     _mm256_storeu_ps(output + (row + i)*inputCol + col + j * lanes, sum[i * blockCol + j]);
             }

             // Scalar tail: interior columns of this tile that do not fill a whole vector.
             for (int c = col + tileVecs * lanes; c < tileColEnd; c++)
             {
                 for (int i = 0; i < tileRows; i++)
                 {
                     float acc = 0.0f;
                     for (int rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
                     {
                         for (int cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
                             acc += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + i + rr)*inputCol + c + cc];
                     }
                     output[(row + i)*inputCol + c] = acc;
                 }
             }
         }
     }
 }

 int main()

 {

     int i, k;

     clock_t time;

     float filterSum;

     // 卷积窗口相关

     const int filterWidth = , filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / ;

     float filter[filterSize] =

     {// 模糊窗口

         ,       ,       ,       , ,

         , .f / , .f / , .f / , ,

         , .f / , .f / , .f / , ,

         , .f / , .f / , .f / , ,

         ,       ,       ,       ,

     };

     for (filterSum = 0.0f, i = ; i < filterSize; filterSum += filter[i++]);

     if (!floatEq(filterSum, ))// 非归零的卷积窗口（如模糊）需要归一化

         for (i = ; i < filterSize; filter[i] /= filterSum, i++);

     // 图片相关

     cv::Mat input = cv::imread(inputFile), output = input, channel[];

     cv::split(input, channel);

     const int inputRow = input.rows, inputCol = input.cols, inputDataSize = inputRow * inputCol;

     float *inputData = (float*)malloc(sizeof(float) * inputDataSize);

     float *outputData = (float*)malloc(sizeof(float) * inputDataSize);

     for (k = ; k < ; k++)// 三个通道，分别为蓝、绿、红

     {

         for (i = ; i < inputRow * inputCol; inputData[i] = (float)channel[k].data[i], i++);

         time = clock();

         convolution01(inputData, outputData, inputRow, inputCol, (const float *)filter, filterWidth);

         //convolution02<filterWidth>(inputData, outputData, inputRow, inputCol, filter);

         //convolution03<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter);

         //convolution04<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter);

         time = clock() - time;

         printf("Time for channel[%d]:%d ms\n", k, time);

         for (i = ; i < inputRow * inputCol; channel[k].data[i] = (unsigned char)outputData[i], i++);

     }

     cv::merge(channel, , output);

     cv::imwrite(outputFile, output);

     //imshow("merge", output);

     //cv::waitKey(0);

     free(inputData);

     free(outputData);

     printf("\nFinished.\n");

     getchar();

     return ;

 }

● 输出结果，使用一张 4608 × 6656 的图片（bmp，87.7MB）进行测试，使用主函数中那个边长为 5、实际窗口长度为 3 的均值窗口。图片太大传不上来，偷梁换柱成小图看效果

■ 计时结果

// convolution01，memset + 内部计算，无 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

// convolution01，手动除边 + 内部计算，无 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

// convolution01，循环内判断，无 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

// convolution01，手动除边 + 内部计算，有 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

// convolution02，有 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

// convolution03<filterWidth, 4, 4>，无 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

// convolution04<filterWidth, 4, 4>，无 OpenMP

Time for channel[]: ms

Time for channel[]: ms

Time for channel[]: ms

Finished.

■ 没法给 convolution03 和 convolution04 加 OpenMP，一加就各种内存冲突，边界判断都挡不住。（原因很可能是 col、rr、cc、i、j、filterElement 等变量声明在并行循环之外，被所有线程共享而产生数据竞争；把它们改为在循环体内声明即可避免。）

■

OpenCL 图像卷积 3 使用 CPU的更多相关文章

OpenCL 图像卷积 2
▶ 上一篇图像卷积 http://www.cnblogs.com/cuancuancuanhao/p/8535569.html.这篇使用了 OpenCV 从文件读取彩色的 jpeg 图像,进行边缘检测 ...
OpenCL 图像卷积 1
▶ 书上的代码改进而成,从文件读入一张 256 阶灰度图,按照给定的卷积窗口计算卷积,并输出到文件中. ● 代码,使用 9 格的均值窗口,居然硬读写 .bmp 文件,算是了解一下该文件的具体格式,留作 ...
SSE图像算法优化系列十一：使用FFT变换实现图像卷积。
本文重点主要不在于FFT的SSE优化,而在于使用FFT实现快速卷积的相关技巧和过程. 关于FFT变换,有很多参考的代码,特别是对于长度为2的整数次幂的序列,实现起来也是非常简易的,而对于非2次幂的序列 ...
图像卷积、相关以及在MATLAB中的操作
图像卷积.相关以及在MATLAB中的操作 2016年7月11日 20:34:35, By ChrisZZ 区分卷积和相关图像处理中常常需要用一个滤波器做空间滤波操作.空间滤波操作有时候也被叫做卷积滤 ...
zz图像卷积与滤波的一些知识点
Xinwei: 写的通俗易懂,终于让我这个不搞CV.不搞图像的外行理解卷积和滤波了. 图像卷积与滤波的一些知识点 zouxy09@qq.com http://blog.csdn.net/zouxy09 ...
对抗生成网络-图像卷积-mnist数据生成(代码) 1.tf.layers.conv2d(卷积操作) 2.tf.layers.conv2d_transpose(反卷积操作) 3.tf.layers.batch_normalize(归一化操作) 4.tf.maximum(用于lrelu) 5.tf.train_variable(训练中所有参数) 6.np.random.uniform(生成正态数据
1. tf.layers.conv2d(input, filter, kernel_size, stride, padding) # 进行卷积操作参数说明:input输入数据, filter特征图的 ...
UFLDL教程笔记及练习答案五（自编码线性解码器与处理大型图像**卷积与池化）
自己主动编码线性解码器自己主动编码线性解码器主要是考虑到稀疏自己主动编码器最后一层输出假设用sigmoid函数.因为稀疏自己主动编码器学习是的输出等于输入.simoid函数的值域在[0,1]之间,这 ...
TensorFlow实现图像卷积并可视化示例
图片尺寸要自己修改. 看起来好像没啥意思,不知道下一步能干什么,先卷了再说.由于weights是随机生成的(tf.random_normal作用:用于从服从指定正太分布的数值中取出随机数),所以每次卷 ...
opencv：图像卷积
卷积基本概念 C++代码实现卷积 #include <opencv2/opencv.hpp> #include <iostream> using namespace cv; u ...

随机推荐

HDU 4548：美素数
Problem Description 小明对数的研究比较热爱,一谈到数,脑子里就涌现出好多数的问题,今天,小明想考考你对素数的认识. 问题是这样的:一个十进制数,如果是素数,而且它的各位数字和也是素 ...
Nginx访问限制模块limit_conn_zone 和limit_req_zone配置使用
nginx可以通过limit_conn_zone 和limit_req_zone两个组件来对客户端访问目录和文件的访问频率和次数进行限制,另外还可以善用进行服务安全加固,两个模块都能够对客户端访问进行 ...
signal信号
1.signal信号调试 http://hongjiang.info/shell-script-background-process-ignore-sigint/
day31 python学习操作系统的介绍，
一背景知识顾名思义,进程即正在执行的一个过程.进程是对正在运行程序的一个抽象. 进程的概念起源于操作系统,是操作系统最核心的概念,也是操作系统提供的最古老也是最重要的抽象概念之一.操作系统的其他所 ...
dbt 基本试用
dbt 是一个很不错的进行etl 中的t 处理的工具,灵活简单,我们需要写的就是select 语句 dbt 帮助我们进行处理测试集成了graphql 以及使用docker 运行安装 pip ins ...
timescaledb 集成prometheus
timescaledb 1.0 已经发布了,同时支持prometheus 使用doker-compose 运行环境准备 docker-compose 文件 version: '2.1' servic ...
转 DataTorrent 1.0每秒处理超过10亿个实时事件
DataTorrent是一个实时的流式处理和分析平台,它每秒可以处理超过10亿个实时事件. 与Twitter平均每秒大约6000条微博相比,最近发布的DataTorrent 1.0似乎已经超出了需求, ...
win7上搭建Android环境及调试
工欲善其事必先利其器,好记性不如烂笔头.要学习一门新的语言,首先必须得先搭环境,否则没法实践.如果之前按照网上的提示,搭建过环境,而且环境比较复杂的话,我相信隔很长一段时间后,就会忘记,到真正用的时候 ...
php 备份和恢复数据库
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...
nyoj 探寻宝藏
探寻宝藏时间限制:1000 ms | 内存限制:65535 KB 难度:5 描述传说HMH大沙漠中有一个M*N迷宫,里面藏有许多宝物.某天,Dr.Kong找到了迷宫的地图,他发现迷宫内处 ...

OpenCL 图像卷积 3 使用 CPU

OpenCL 图像卷积 3 使用 CPU的更多相关文章

随机推荐

热门专题