OpenCL 图像卷积 1

▶ 书上的代码改进而成，从文件读入一张 256 阶灰度图，按照给定的卷积窗口计算卷积，并输出到文件中。

● 代码，使用 9 格的均值窗口，居然硬读写 .bmp 文件，算是了解一下该文件的具体格式，留作纪念吧。

 // convolution.cl

 __kernel void convolution01(__read_only image2d_t inputImage, __write_only image2d_t outputImage,

     int imageRow, int imageCol, __constant float* filter, int filterWidth, sampler_t sampler)

 {

     const int row = get_global_id(), col = get_global_id();   // 注意工作项的顺序，可以和图像读取不一样

     const int halfWidth = filterWidth / ;

     float4 sum = { 0.0f, 0.0f, 0.0f, 0.0f }, pixel;             // 输出数据类型是四元浮点数

     int i, j, filterIdx;                                        // 卷积窗口单独用一个下标遍历

     for (filterIdx = , i = -halfWidth; i <= halfWidth; i++)

     {

         for (j = -halfWidth; j <= halfWidth; j++)

         {

             pixel = read_imagef(inputImage, sampler, (int2)(col + j, row + i)); // 读取目标坐标，注意列在前行在后

             sum.x += pixel.x * filter[filterIdx++];                             // 采用了单通道，只有第一分量有效

         }

     }

     if (row < imageRow && col < imageCol)                       // 将落在有效范围内的计算数据输出

         write_imagef(outputImage, (int2)(col, row), sum);

     return;

 }

 // main.cpp

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <cl.h>

 #pragma warning(disable : 4996)

 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";

 const char *inputFile = "R:/input.bmp";

 const char *outputFile = "R:/output.bmp";

 bool floatEq(const float a, const float b)// 相等返回 1

 {

     return (b == ) ? fabs(a) < 0.001 : fabs(a / b - ) < 0.001;

 }

 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode，返回字符串长度

 {

     FILE *fp;

     int size;

     //printf("<readText> File: %s\n", kernelPath);

     fopen_s(&fp, kernelPath, "rb");

     if (!fp)

     {

         printf("<readText> Open file failed\n");

         getchar();

         exit(-);

     }

     if (fseek(fp, , SEEK_END) != )

     {

         printf("<readText> Seek end of file failed\n");

         getchar();

         exit(-);

     }

     if ((size = ftell(fp)) < )

     {

         printf("<readText> Get file position failed\n");

         getchar();

         exit(-);

     }

     rewind(fp);

     if ((*pcode = (char *)malloc(size + )) == NULL)

     {

         printf("<readText> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(*pcode, , size, fp);

     (*pcode)[size] = '\0';

     fclose(fp);

     return size + ;

 }

 void storeImage(float *imageOut, const char *filename, const char *refFilename)// 输出图片

 {

     FILE *ifp, *ofp;

     unsigned char *metaData, temp;

     int offset, i, j, row, col, mod;

     if (fopen_s(&ifp, refFilename, "rb") != )// 从 参考图片（输入文件）中读取需要的行列数

     {

         printf(filename);

         exit(-);

     }

     fseek(ifp, , SEEK_SET);

     fread(&offset, , , ifp);

     fseek(ifp, , SEEK_SET);

     fread(&col, , , ifp);

     fread(&row, , , ifp);

     fseek(ifp, , SEEK_SET);

     if ((metaData = (unsigned char *)malloc(offset)) == NULL)

     {

         printf("<storeImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(metaData, , offset, ifp);                  // 从输入文件中读取元信息

     //printf("Output image %s\n", filename);

     if (fopen_s(&ofp, filename, "wb") != )

     {

         printf("<storeImage> Open output file failed\n");

         getchar();

         exit(-);

     }

     if (fwrite(metaData, , offset, ofp) != offset)   //  将元信息原封不动的放入输出文件中

     {

         printf("<storeImage> Write output metaData failed\n");

         getchar();

         exit(-);

     }

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)// .bmp 行是颠倒的，倒着填充

     {

         for (j = ; j < col; j++)

         {

             temp = (unsigned char)imageOut[i * col + j];

             fwrite(&temp, sizeof(unsigned char), , ofp);

         }

         for (j = ; j < mod; fwrite(&temp, sizeof(unsigned char), , ofp), j++);// 列数非 4 的倍数时补上 junk padding

     }

     fclose(ifp);

     fclose(ofp);

     free(metaData);

     return;

 }

 float *readImage(const char *filename, int *outputRow, int *outputCol)// 从文件读取图片

 {

     unsigned char temp;

     int i, j, row, col, offset, mod;

     float *outputImage;

     FILE *fp;

     if (fopen_s(&fp, filename, "rb") != )

     {

         printf("<readImage> Open file failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, , SEEK_SET);    // 第 10 字节的位置

     fread(&offset, , , fp);   // 元信息大小

     fseek(fp, , SEEK_SET);    // 第 18 字节位置

     fread(&col, , , fp);      // 读取列数和行数

     fread(&row, , , fp);

     printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);

     if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)

     {

         printf("<readImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, offset, SEEK_SET);// 元信息结束的地方，开始读图像数据

     fflush(NULL);

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)// .bmp 行是颠倒的，顺着读文件，倒着填充，mod 为列的 junk pading 厚度

     {

         for (j = ; j < col; j++)

         {

             fread(&temp, sizeof(unsigned char), , fp);

             outputImage[i * col + j] = (float)temp;

         }

         for (j = ; j < mod; fread(&temp, sizeof(unsigned char), , fp), j++);// 读取 junk padding，不传入数据中(.bmp 文件中有这几列，但是不显示)

     }

     fclose(fp);

     *outputRow = row;

     *outputCol = col;

     return outputImage;

 }

 int main()

 {

     int imageRow, imageCol, dataSize, row, col, i, j, correct;

     float *inputImage, *outputImage, sum;

     inputImage = readImage(inputFile, &imageRow, &imageCol);// 从文件读取图像数据和行列信息

     dataSize = imageRow * imageCol * sizeof(float);

     outputImage = (float*)malloc(dataSize);

     const int filterWidth = , filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / ;

     float filter[] =

     { ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,

     };

     cl_int status;

     cl_uint nPlatform;

     clGetPlatformIDs(, NULL, &nPlatform);

     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));

     clGetPlatformIDs(nPlatform, listPlatform, NULL);

     cl_uint nDevice = ;

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, , NULL, &nDevice);

     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);

     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);

     cl_command_queue queue = clCreateCommandQueue(context, listDevice[], , &status);

     cl_image_format format;                     // 图像格式描述符

     format.image_channel_order = CL_R;          // 单通道

     format.image_channel_data_type = CL_FLOAT;  // 浮点类型（读进来还是 uchar，但是计算需要浮点）

     cl_mem d_inputImage, d_outputImage, d_filter;

     if (true)// 旧 OpenCL 函数 clCreateImage2D

     {

         d_inputImage = clCreateImage2D(context, , &format, imageCol, imageRow, , NULL, &status);

         d_outputImage = clCreateImage2D(context, , &format, imageCol, imageRow, , NULL, &status);

     }

     if (false)// 新 OpenCL 使用描述符图像描述符 cl_image_desc 和函数 clCreateImage

     {

         cl_image_desc desc;

         desc.image_type = CL_MEM_OBJECT_IMAGE2D;

         desc.image_width = imageCol;

         desc.image_height = imageRow;

         desc.image_depth = ;

         desc.image_array_size = ;

         desc.image_row_pitch = ;

         desc.image_slice_pitch = ;

         desc.num_mip_levels = ;

         desc.num_samples = ;

         desc.buffer = NULL;

         d_inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &status);

         d_outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &status);

     }

     d_filter = clCreateBuffer(context, , filterSize * sizeof(float), NULL, &status);

     size_t origin[] = { , ,  }, region[] = { imageCol, imageRow,  };  // 拷贝图像数据用的原点和尺寸，注意尺寸是先数列再数行

     clEnqueueWriteImage(queue, d_inputImage, CL_TRUE, origin, region, , , inputImage, , NULL, NULL);

     clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, , filterSize * sizeof(float), filter, , NULL, NULL);

     cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);// 采样器

     char *code;

     size_t length = readText(sourceText, &code);

     cl_program program = clCreateProgramWithSource(context, , (const char **)&code, &length, NULL);

     clBuildProgram(program, , listDevice, NULL, NULL, NULL);

     cl_kernel kernel = clCreateKernel(program, "convolution01", &status);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_inputImage);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_outputImage);

     clSetKernelArg(kernel, , sizeof(int), &imageRow);

     clSetKernelArg(kernel, , sizeof(int), &imageCol);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_filter);

     clSetKernelArg(kernel, , sizeof(int), &filterWidth);

     clSetKernelArg(kernel, , sizeof(cl_sampler), &sampler);

     size_t globalSize[] = { imageRow, imageCol };// localSize = { 1, 1}; 可以用 NULL 代替

     status = clEnqueueNDRangeKernel(queue, kernel, , NULL, globalSize, NULL, , NULL, NULL);

     clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, region, , , outputImage, , NULL, NULL);

     storeImage(outputImage, outputFile, inputFile);// 将输出图像写入文件中

     for (row = , correct = ; row < imageRow && correct; row++)// 检查计算结果

     {

         for (col = ; col < imageCol && correct; col++)

         {

             sum = ;

             for (i = -halfFilterWidth; i <= halfFilterWidth; i++)

             {

                 for (j = -halfFilterWidth; j <= halfFilterWidth; j++)

                 {

                     if (row + i >=  && row + i < imageRow && col + j >=  && col + j < imageCol)

                         sum += inputImage[(row + i) * imageCol + col + j] * filter[(i + halfFilterWidth) * filterWidth + j + halfFilterWidth];

                 }

             }

             if (row >= halfFilterWidth && row < imageRow - halfFilterWidth && col >= halfFilterWidth && col < imageCol - halfFilterWidth &&

                 !floatEq(outputImage[row * imageCol + col], sum))

             {

                 printf("Error at [%d,%d], output:%f, ref:%f\n", row, col, outputImage[row*imageCol + col], sum);

                 correct = ;

             }

         }

     }

     if (correct)

         printf("Result correct.\n");

     free(listPlatform);

     free(listDevice);

     free(inputImage);

     free(outputImage);

     free(code);

     clReleaseContext(context);

     clReleaseCommandQueue(queue);

     clReleaseProgram(program);

     clReleaseKernel(kernel);

     clReleaseMemObject(d_inputImage);

     clReleaseMemObject(d_outputImage);

     clReleaseMemObject(d_filter);

     clReleaseSampler(sampler);

     getchar();

     return ;

 }

● 输出结果，给了一张 400 × 400 的图片参与，另外，计算一张 5040 × 7000 的图片需要 23 ms。

<readImage> Input image R:/input.bmp, width = , height =

Output image R:/output.bmp

Result correct.

● 代码，使用局部内存优化

 // convolution.cl

 __kernel void convolution02(__global float* inputImage, __global float* outputImage, int  imageRow, int  imageCol,

     __constant float* filter, int filterWidth, __local float* localMem, int  localMemRow, int  localMemCol)

 {

     const int groupCol = get_group_id() * get_local_size(), groupRow = get_group_id() * get_local_size();

     const int localCol = get_local_id(), localRow = get_local_id();

     const int globalCol = groupCol + localCol, globalRow = groupRow + localRow;

     const int halfWidth = filterWidth / ;

     int i, j, curRow, curCol, filterIndex;

     float sum;

     // 将源图数据读入局部内存

     for (i = localRow; i < localMemRow; i += get_local_size())

     {

         curRow = groupRow + i;

         for (j = localCol; j < localMemCol; j += get_local_size())

         {

             curCol = groupCol + j;

             if (curRow < imageRow && curCol < imageCol)

                 localMem[i * localMemCol + j] = inputImage[curRow * imageCol + curCol];

         }

     }

     barrier(CLK_LOCAL_MEM_FENCE);     

     // 计算卷积

     if (globalRow < imageRow - filterWidth +  && globalCol < imageCol - filterWidth + )// 选取位于有效范围内的工作组

     {

         sum = 0.0f, filterIndex = ;

         for (i = localRow; i < localRow + filterWidth; i++)

         {

             for (j = localCol; j < localCol + filterWidth; j++)

                 sum += localMem[i * localMemCol + j] * filter[filterIndex++];

         }

         // 循环展开

         /*

         for (i = localRow; i < localRow + filterWidth; i++)

         {

             int offset = i * localMemCol + localCol;

             sum += localMem[offset++] * filter[filterIndex++];// 行数等于 filterWidth

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

         }

         */

         // 循环完全展开

         /*

         int offset = localRow*localMemCol + localCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         */

         // 数据输出

         outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum;

     }

     return;

 }

 __kernel void convolution03(__global float4* inputImage, __global float* outputImage, int  imageRow, int imageCol,

     __constant float* filter, int filterWidth, __local float* localMem, int  localMemRow, int  localMemCol)

 {

     const int groupCol4 = get_group_id() * get_local_size() / , groupRow4 = get_group_id() * get_local_size();

     const int localId = get_local_id() * get_local_size() + get_local_id();

     int localCol = (localId % (localMemCol / )), localRow = (localId / (localMemCol / ));

     int globalCol = groupCol4 + localCol, globalRow = groupRow4 + localRow;

     const int halfWidth = filterWidth / ; 

     __local float4 *localImage4 = (__local float4*)&localMem[localRow*localMemCol + localCol * ];// 局部内存数据

     if (globalRow < imageRow && globalCol < imageCol /  && localRow < localMemRow)

         localImage4[] = inputImage[globalRow*imageCol /  + globalCol];

     barrier(CLK_LOCAL_MEM_FENCE);

     // 重设 坐标以输出

     localCol = get_local_id();

     localRow = get_local_id();

     globalCol = get_group_id()*get_local_size() + localCol;

     globalRow = get_group_id()*get_local_size() + localRow;

     // 计算卷积

     int i, j, filterIndex;

     float sum;

     if (globalRow < imageRow - filterWidth +  && globalCol < imageCol - filterWidth + )

     {

         sum = 0.0f, filterIndex = ;

         for (i = localRow; i < localRow + filterWidth; i++)

         {

             for (int j = localCol; j < localCol + filterWidth; j++)

                 sum += localMem[i * localMemCol + j] * filter[filterIndex++];

         }

         // 循环展开同 convolution02

         // 输出数据

         outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = ;//sum;

     }

     return;

 }

 // main.cpp

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <cl.h>

 #pragma warning(disable : 4996)

 #define NON_OPTIMIZED   // 不使用优化，使用函数 convolution02

 //#define READ_ALIGNED  // 使用内存对齐优化，使用函数 convolution02

 //#define READ4         // 局部内存使用 float4 读取优化，使用函数 convolution03，有点问题

 #define WGX 16          // 工作组尺寸

 #define WGY 16

 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";

 const char *inputFile = "R:/input.bmp";

 const char *outputFile = "R:/output.bmp";

 unsigned int roundUp(unsigned int value, unsigned int base)// 将 value 向上取整到 multiple 的整数倍

 {

     unsigned int remainder = value % base;

     return remainder ==  ? value : (value + base - remainder);

 }

 int readText(const char* kernelPath, char **pcode)

 {

     FILE *fp;

     int size;

     //printf("<readText> File: %s\n", kernelPath);

     fopen_s(&fp, kernelPath, "rb");

     if (!fp)

     {

         printf("<readText> Open file failed\n");

         getchar();

         exit(-);

     }

     if (fseek(fp, , SEEK_END) != )

     {

         printf("<readText> Seek end of file failed\n");

         getchar();

         exit(-);

     }

     if ((size = ftell(fp)) < )

     {

         printf("<readText> Get file position failed\n");

         getchar();

         exit(-);

     }

     rewind(fp);

     if ((*pcode = (char *)malloc(size + )) == NULL)

     {

         printf("<readText> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(*pcode, , size, fp);

     (*pcode)[size] = '\0';

     fclose(fp);

     return size + ;

 }

 void storeImage(float *imageOut, const char *filename, const char *refFilename)

 {

     FILE *ifp, *ofp;

     unsigned char *metaData, temp;

     int offset, i, j, row, col, mod;

     if (fopen_s(&ifp, refFilename, "rb") != )

     {

         printf(filename);

         exit(-);

     }

     fseek(ifp, , SEEK_SET);

     fread(&offset, , , ifp);

     fseek(ifp, , SEEK_SET);

     fread(&col, , , ifp);

     fread(&row, , , ifp);

     fseek(ifp, , SEEK_SET);

     if ((metaData = (unsigned char *)malloc(offset)) == NULL)

     {

         printf("<storeImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(metaData, , offset, ifp);

     if (fopen_s(&ofp, filename, "wb") != )

     {

         printf("<storeImage> Open output file failed\n");

         getchar();

         exit(-);

     }

     if (fwrite(metaData, , offset, ofp) != offset)

     {

         printf("<storeImage> Write output metaData failed\n");

         getchar();

         exit(-);

     }

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)

     {

         for (j = ; j < col; j++)

         {

             temp = (unsigned char)imageOut[i * col + j];

             fwrite(&temp, sizeof(unsigned char), , ofp);

         }

         for (j = ; j < mod; fwrite(&temp, sizeof(unsigned char), , ofp), j++);

     }

     fclose(ifp);

     fclose(ofp);

     free(metaData);

     return;

 }

 float *readImage(const char *filename, int *outputRow, int *outputCol)

 {

     unsigned char temp;

     int i, j, row, col, offset, mod;

     float *outputImage;

     FILE *fp;

     if (fopen_s(&fp, filename, "rb") != )

     {

         printf("<readImage> Open file failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, , SEEK_SET);

     fread(&offset, , , fp);

     fseek(fp, , SEEK_SET);

     fread(&col, , , fp);

     fread(&row, , , fp);

     printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);

     if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)

     {

         printf("<readImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, offset, SEEK_SET);

     fflush(NULL);

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)

     {

         for (j = ; j < col; j++)

         {

             fread(&temp, sizeof(unsigned char), , fp);

             outputImage[i * col + j] = (float)temp;

         }

         for (j = ; j < mod; fread(&temp, sizeof(unsigned char), , fp), j++);

     }

     fclose(fp);

     *outputRow = row;

     *outputCol = col;

     return outputImage;

 }

 int main()

 {

     int imageRow, imageCol, dataSize, deviceRow, deviceCol, deviceDataSize;

     float *inputImage, *outputImage;

     inputImage = readImage(inputFile, &imageRow, &imageCol);

     dataSize = imageRow * imageCol * sizeof(float);

     outputImage = (float*)malloc(dataSize);

     // 调整列数

 #ifdef NON_OPTIMIZED        // 不调整

     deviceCol = imageCol;

 #else                       // 增加道工作组尺寸的整数倍

     deviceCol = roundUp(imageCol, WGX);

 #endif

     deviceRow = imageRow;   // 行数不变

     deviceDataSize = sizeof(float) * deviceRow * deviceCol;

     const int filterWidth = , filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / ;

     float filter[] =

     {

         ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,

     };

     cl_int status;

     cl_uint nPlatform;

     clGetPlatformIDs(, NULL, &nPlatform);

     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));

     clGetPlatformIDs(nPlatform, listPlatform, NULL);

     cl_uint nDevice = ;

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, , NULL, &nDevice);

     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);

     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);

     cl_command_queue queue = clCreateCommandQueue(context, listDevice[], , &status);

     // 使用普通的缓冲区，而不用 image

     cl_mem d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL);

     cl_mem d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL);

     cl_mem d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * filterSize, NULL, NULL);

     // 缓冲区写入

 #if defined NON_OPTIMIZED   // 直接写入

     clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, , deviceDataSize, inputImage, , NULL, NULL);

 #else                       // 对齐写入

     size_t d_origin[] = { ,, }, h_origin[] = { ,, }, region[] = { sizeof(float) * deviceCol, deviceRow,  };

     clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, , sizeof(float) * imageCol, , inputImage, , NULL, NULL);

 #endif

     clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, , sizeof(float) * filterSize, filter, , NULL, NULL);

     char *code;

     size_t length = readText(sourceText, &code);

     cl_program program = clCreateProgramWithSource(context, , (const char **)&code, &length, &status);

     status = clBuildProgram(program, , listDevice, NULL, NULL, NULL);

     // 创建不同的内核

 #if defined NON_OPTIMIZED || defined READ_ALIGNED

     cl_kernel kernel = clCreateKernel(program, "convolution02", NULL);

 #else

     cl_kernel kernel = clCreateKernel(program, "convolution03", NULL);

 #endif

     size_t globalSize[] = { roundUp(imageCol - filterWidth + , WGX), roundUp(imageRow - filterWidth + , WGY) }, localSize[] = { WGX, WGY };

     // 局部内存大小

     int localRow, localCol;

     localRow = localSize[] + filterWidth - ;              // 把一个工作组的大小垫起光环元素的宽度

 #if defined NON_OPTIMIZED || defined READ_ALIGNED

     localCol = localSize[] + filterWidth - ;

 #else

     localCol = roundUp(localSize[] + filterWidth - , );  //  垫起之外还要对齐到 4 的倍数上

 #endif

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_inputImage);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_outputImage);

     clSetKernelArg(kernel, , sizeof(int), &deviceRow);

     clSetKernelArg(kernel, , sizeof(int), &deviceCol);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_filter);

     clSetKernelArg(kernel, , sizeof(int), &filterWidth);

     clSetKernelArg(kernel, , sizeof(float) * localCol * localRow, NULL);

     clSetKernelArg(kernel, , sizeof(int), &localRow);

     clSetKernelArg(kernel, , sizeof(int), &localCol);

     status = clEnqueueNDRangeKernel(queue, kernel, , NULL, globalSize, localSize, , NULL, NULL);

     // 结果写回

 #if defined NON_OPTIMIZED

     clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, , deviceDataSize, outputImage, , NULL, NULL);

 #else   // 最边上一圈 filterWidth / 2 的部分不写回

     d_origin[] =  * sizeof(float), d_origin[] = , d_origin[] = ;

     h_origin[] =  * sizeof(float), h_origin[] = , h_origin[] = ;

     region[] = (imageCol - filterWidth + ) * sizeof(float), region[] = (imageRow - filterWidth + ), region[] = ;

     clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, , sizeof(float) * imageCol, , outputImage, , NULL, NULL);

 #endif

     storeImage(outputImage, outputFile, inputFile);

     // 去掉了检查结果的部分

     free(listPlatform);

     free(listDevice);

     free(inputImage);

     free(outputImage);

     free(code);

     clReleaseContext(context);

     clReleaseCommandQueue(queue);

     clReleaseProgram(program);

     clReleaseKernel(kernel);

     clReleaseMemObject(d_inputImage);

     clReleaseMemObject(d_outputImage);

     clReleaseMemObject(d_filter);

     printf("Finshed.\n");

     getchar();

     return ;

 }

● 输出结果，与上面的简单方法相同

● 用到的函数和定义

 //cl.h

 // 采样器越界处理方案

 #define CL_ADDRESS_NONE                             0x1130

 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131

 #define CL_ADDRESS_CLAMP                            0x1132

 #define CL_ADDRESS_REPEAT                           0x1133

 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134

 // 插值方案

 #define CL_FILTER_NEAREST                           0x1140

 #define CL_FILTER_LINEAR                            0x1141

 // 用到的采样器和描述符的定义

 typedef struct _cl_sampler* cl_sampler;

 typedef struct _cl_image_format

 {

     cl_channel_order        image_channel_order;

     cl_channel_type         image_channel_data_type;

 } cl_image_format;

 typedef struct _cl_image_desc

 {

     cl_mem_object_type      image_type;

     size_t                  image_width;

     size_t                  image_height;

     size_t                  image_depth;

     size_t                  image_array_size;

     size_t                  image_row_pitch;

     size_t                  image_slice_pitch;

     cl_uint                 num_mip_levels;

     cl_uint                 num_samples;

     cl_mem                  buffer;

 } cl_image_desc;

 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(// OpenCL1.2 中废弃的 image 创建函数

     cl_context,             // 上下文

     cl_mem_flags,           // 特殊标志

     const cl_image_format *,// image 描述符

     size_t,                 // 宽度

     size_t,                 // 高度

     size_t,                 // 行跨步

     void *,                 // 自动传入主机数据

     cl_int *                // 返回结果状态的指针

 ) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(// OpenCL1.2 中开始使用的 image 创建函数

     cl_context,             // 上下文

     cl_mem_flags,           // 特殊标志

     const cl_image_format *,// image 格式描述符

     const cl_image_desc *,  // image 描述符

     void *,                 // 主机数据

     cl_int *                // 返回结果状态的指针

 ) CL_API_SUFFIX__VERSION_1_2;

 extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(// 初始化采样器

     cl_context,         // 上下文

     cl_bool,            // 是否使用归一化坐标

     cl_addressing_mode, // 越界处理方案

     cl_filter_mode,     // 差值方案

     cl_int *            // 返回结果状态的指针

 ) CL_API_SUFFIX__VERSION_1_0;

 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(// 矩形缓冲区写入

     cl_command_queue,   // 命令队列

     cl_mem,             // 目标缓冲区

     cl_bool,            // 阻塞标记

     const size_t *,     // 缓冲区写入起点

     const size_t *,     // 源数据写入起点

     const size_t *,     // 写入范围，是一个三维数组，分别为：一行数据量（Byte），行数，层数

     size_t,             // 缓冲区行间跨度

     size_t,             // 缓冲区层间跨度

     size_t,             // 源数据行间跨度

     size_t,             // 源数据层间跨度

     const void *,       // 源数据指针

     cl_uint,            // 时间列表长度

     const cl_event *,   // 时间列表

     cl_event *          // 本事件标记

 ) CL_API_SUFFIX__VERSION_1_1;

 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(// 矩形缓冲区读出，参数定义同上

     cl_command_queue,

     cl_mem,

     cl_bool,

     const size_t *,

     const size_t *,

     const size_t *,

     size_t,

     size_t,

     size_t,

     size_t,

     void *,

     cl_uint,

     const cl_event *,

     cl_event *

 ) CL_API_SUFFIX__VERSION_1_1;

OpenCL 图像卷积 1的更多相关文章

OpenCL 图像卷积 2
▶ 上一篇图像卷积 http://www.cnblogs.com/cuancuancuanhao/p/8535569.html.这篇使用了 OpenCV 从文件读取彩色的 jpeg 图像,进行边缘检测 ...
OpenCL 图像卷积 3 使用 CPU
▶ CPU 图像卷积,共四种方法.分别为基本串行,使用模板,使用局部内存,使用AVX指令优化 ● 全部的代码,仅在主函数中选择调用的函数名即可. #include <stdio.h> #i ...
图像卷积、相关以及在MATLAB中的操作
图像卷积.相关以及在MATLAB中的操作 2016年7月11日 20:34:35, By ChrisZZ 区分卷积和相关图像处理中常常需要用一个滤波器做空间滤波操作.空间滤波操作有时候也被叫做卷积滤 ...
SSE图像算法优化系列十一：使用FFT变换实现图像卷积。
本文重点主要不在于FFT的SSE优化,而在于使用FFT实现快速卷积的相关技巧和过程. 关于FFT变换,有很多参考的代码,特别是对于长度为2的整数次幂的序列,实现起来也是非常简易的,而对于非2次幂的序列 ...
zz图像卷积与滤波的一些知识点
Xinwei: 写的通俗易懂,终于让我这个不搞CV.不搞图像的外行理解卷积和滤波了. 图像卷积与滤波的一些知识点 zouxy09@qq.com http://blog.csdn.net/zouxy09 ...
对抗生成网络-图像卷积-mnist数据生成(代码) 1.tf.layers.conv2d(卷积操作) 2.tf.layers.conv2d_transpose(反卷积操作) 3.tf.layers.batch_normalize(归一化操作) 4.tf.maximum(用于lrelu) 5.tf.train_variable(训练中所有参数) 6.np.random.uniform(生成正态数据
1. tf.layers.conv2d(input, filter, kernel_size, stride, padding) # 进行卷积操作参数说明:input输入数据, filter特征图的 ...
UFLDL教程笔记及练习答案五（自编码线性解码器与处理大型图像**卷积与池化）
自己主动编码线性解码器自己主动编码线性解码器主要是考虑到稀疏自己主动编码器最后一层输出假设用sigmoid函数.因为稀疏自己主动编码器学习是的输出等于输入.simoid函数的值域在[0,1]之间,这 ...
TensorFlow实现图像卷积并可视化示例
图片尺寸要自己修改. 看起来好像没啥意思,不知道下一步能干什么,先卷了再说.由于weights是随机生成的(tf.random_normal作用:用于从服从指定正太分布的数值中取出随机数),所以每次卷 ...
opencv：图像卷积
卷积基本概念 C++代码实现卷积 #include <opencv2/opencv.hpp> #include <iostream> using namespace cv; u ...

随机推荐

dbms_job.submit方式创建job，太老了
--方法一declarejobno number; begin dbms_job.submit(jobno, 'xxxx;', xxxx, 'xxxx'); commit ...
使用POI导入小数变成浮点数异常
例如我在Excel中的数据为17.2, 导入到表中就变成了17.1999999或者17.20000001 原因是我用double接收了17.2,然后直接用了String去转换,精度就丢失了. 代 ...
idea配置echache.xml报错Cannot resolve file 'ehcache.xsd'
解决方法: 打开settings->languages&frameworks->schemas and dtds ,添加地址 http://ehcache.org/ehcache. ...
git 如何关联多个库
git 如何关联多个库在本地同一个库进行操作多个库,可以把代码推送到不同的库中,可以拉取某个库指定的代码,这样更加灵活使用 git remote 查看远程主机名 git remote -v // λ ...
Class.getResource()方法的使用
我们之前使用路径总是有点不知道怎么用,发现别人使用Class.getResource()方法,好像挺不错的样子.于是看看博客,简单学习下. 参考链接:http://blog.csdn.net/lcj8 ...
ajax下载，前端js下载（转）
前面一直做过下载的功能.就是后台将文件流写入response里面,然后就好了.前台会自动弹出下载提示等. 今天打算做一个ajax下载.想当然的结果死活浏览器没反应.我擦. 然后浏览器调试,发现resp ...
Java中的HashMap的工作原理是什么？
问答题23 /120 Java中的HashMap的工作原理是什么? 参考答案 Java中的HashMap是以键值对(key-value)的形式存储元素的.HashMap需要一个hash函数,它使用ha ...
Linux的发行版之间的联系和区别
转载:https://blog.csdn.net/suixin788/article/details/52555558 联系 Linux的内核源代码和Linux的应用程序都可以自由获得,因此很多公司组 ...
阻塞和非阻塞I/O
阻塞和非阻塞I/O是设备访问的两种不同模式,驱动程序可以灵活的支持用户空间对设备的这两种访问形式. 阻塞操作是指在执行设备操作时,若不能获得资源,则挂起进程直到满足可操作的条件后在进行操 ...
PCB 设计文件中哪些可以不做成元件
PCB 文件中哪些可以不做成元件在 PCB 设计时有些元件为了快速,实际并不需要做成元件. 因为做成元件会浪费时间,并且生产时也没有实际的元件. 如下(持续更新): PCB 与 PCB 之前的连接人 ...

OpenCL 图像卷积 1

OpenCL 图像卷积 1的更多相关文章

随机推荐

热门专题