▶ 计算直方图,由原子计数和规约计算两部分组成

● 最简单的版本,代码

 // kernel.cl
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

#define N_BANK 16           // number of cache banks; every bin keeps one counter per bank
#define SIZE_OF_BIN 256     // 256 bins
#define BIT_OF_ELEMENT 8    // one element is 8 bits wide, value range 0 ~ 255
// Fully parenthesised so the macros stay correct when used inside a larger expression.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))

// Builds one partial histogram per work-group.
//   input             : packed 8-bit elements, read as uint4 (16 elements per read)
//   Histogram         : receives SIZE_OF_BIN counters per work-group, stored at
//                       offset get_group_id(0) * SIZE_OF_BIN
//   element4PerThread : number of uint4 values each work-item processes
// Counting is done with local-memory atomics on N_BANK copies of every bin
// (the copy is selected by lid % N_BANK); the copies are summed at the end.
__kernel __attribute__((reqd_work_group_size(SIZE_OF_BIN, 1, 1)))   // work-group size is tied to the number of bins
void histogramKernel(__global uint4 *input, __global uint *Histogram, uint element4PerThread)
{
    __local uint subhists[N_BANK * SIZE_OF_BIN];
    const uint gid = get_global_id(0), lid = get_local_id(0), stride = get_global_size(0);
    const uint bank = lid % N_BANK;                         // which of the N_BANK copies this work-item updates
    const uint elementsPerWord = 32 / BIT_OF_ELEMENT;       // 8-bit elements packed in each 32-bit component
    uint i, k, idx, lmem_items = N_BANK * SIZE_OF_BIN, lmem_items_per_thread, lmem_max_threads, bin;
    uint4 temp, temp2;

    // Work out how many work-items take part in clearing local memory.
    // (The blog author notes these lines were kept as found, without fully understanding them.)
    lmem_max_threads = MAX(1, get_local_size(0) / lmem_items);      // threads per local-memory item, at least 1
    lmem_max_threads = MAX(1, lmem_max_threads / lmem_items);       // but no more than we have items
    lmem_max_threads = lmem_items / lmem_max_threads;               // calculate threads total
    lmem_max_threads = MIN(get_local_size(0), lmem_max_threads);    // but no more than LDS banks
    lmem_items_per_thread = lmem_items / lmem_max_threads;

    // Clear the sub-histograms. The loop-bound divisor was lost in the source text;
    // each participating work-item clears all lmem_items_per_thread of its entries,
    // which is what covers the whole of subhists.
    if (lid < lmem_max_threads)
    {
        for (i = 0, idx = lid; i < lmem_items_per_thread; i++, idx += lmem_max_threads)
            subhists[idx] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Atomic counting: work-item gid reads input[gid], input[gid + stride], ...
    for (i = 0, idx = gid; i < element4PerThread; i++, idx += stride)
    {
        temp = input[idx];
        // Peel off the 8-bit elements from lowest to highest byte of every component.
        // The mask SIZE_OF_BIN - 1 relies on SIZE_OF_BIN == 1 << BIT_OF_ELEMENT.
        for (k = 0; k < elementsPerWord; k++)
        {
            temp2 = (temp & (SIZE_OF_BIN - 1)) * N_BANK + bank;     // bin index -> slot of this work-item's bank
            (void)atom_inc(subhists + temp2.x);
            (void)atom_inc(subhists + temp2.y);
            (void)atom_inc(subhists + temp2.z);
            (void)atom_inc(subhists + temp2.w);
            temp = temp >> BIT_OF_ELEMENT;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduction: work-item lid sums the N_BANK consecutive counters of bin lid
    // and writes the total into this work-group's slice of Histogram.
    if (lid < SIZE_OF_BIN)
    {
        for (i = 0, bin = 0; i < N_BANK; i++)
            bin += subhists[(lid * N_BANK) + i];
        Histogram[(get_group_id(0) * SIZE_OF_BIN) + lid] = bin;
    }
}

// Sums the nSubHists partial histograms produced by histogramKernel.
// Work-item gid adds up the entries Histogram[i * SIZE_OF_BIN + gid] for
// i in [0, nSubHists) and stores the total in Histogram[gid], i.e. the final
// histogram overwrites the first partial histogram.
__kernel void reduceKernel(__global uint *Histogram, uint nSubHists)
{
    const uint gid = get_global_id(0);
    uint i, bin;    // unsigned, to match nSubHists in the loop condition

    for (i = 0, bin = 0; i < nSubHists; i++)
        bin += Histogram[(i * SIZE_OF_BIN) + gid];
    Histogram[gid] = bin;
}
 // main.c
#include <cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <time.h> #define SIZE_OF_BIN 256 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl"; int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度
{
FILE *fp;
int size;
//printf("<readText> File: %s\n", kernelPath);
fopen_s(&fp, kernelPath, "rb");
if (!fp)
{
printf("<readText> Open file failed\n");
getchar();
exit(-);
}
if (fseek(fp, , SEEK_END) != )
{
printf("<readText> Seek end of file failed\n");
getchar();
exit(-);
}
if ((size = ftell(fp)) < )
{
printf("<readText> Get file position failed\n");
getchar();
exit(-);
}
rewind(fp);
if ((*pcode = (char *)malloc(size + )) == NULL)
{
printf("<readText> Allocate space failed\n");
getchar();
exit(-);
}
fread(*pcode, , size, fp);
(*pcode)[size] = '\0';
fclose(fp);
return size + ;
} int main(int argc, char * argv[])
{
_putenv("GPU_DUMP_DEVICE_KERNEL=3");// 在程序目录输出出 il 和 isa 形式的 kernel 文件,可以使用 isa 汇编调试 cl_int status;
cl_uint nPlatform;
clGetPlatformIDs(, NULL, &nPlatform);
cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
clGetPlatformIDs(nPlatform, listPlatform, NULL);
cl_uint nDevice = ;
clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, , NULL, &nDevice);
cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
cl_command_queue queue = clCreateCommandQueue(context, listDevice[], , &status); char *code;
size_t length = readText(sourceText, &code);
cl_program program = clCreateProgramWithSource(context, , (const char **)&code, &length, NULL);
status = clBuildProgram(program, , listDevice, NULL, NULL, NULL);
cl_kernel histogram = clCreateKernel(program, "histogramKernel", &status);
cl_kernel reduce = clCreateKernel(program, "reduceKernel", &status); cl_uint nThreads = ; // 64 * 1024
cl_uint nThreadsPerGroup = ; // 原始值:KernelCompileWorkGroupSize[0]
cl_uint nGroups = nThreads / nThreadsPerGroup; cl_uint inputByte = ; // 原始值:DeviceMaxMemAllocSize == 2147483648 == 2^31
cl_uint outputNBytes = nGroups * SIZE_OF_BIN * sizeof(cl_uint);
cl_uint element = inputByte / sizeof(cl_uint);
cl_uint element4 = inputByte / sizeof(cl_uint4);
cl_uint element4PerThread = element4 / nThreads; unsigned int *input = (unsigned int*)malloc(inputByte);
unsigned int *cpuhist = (unsigned int*)malloc(outputNBytes);
unsigned int *gpuhist = (unsigned int*)malloc(outputNBytes);
memset(input, , inputByte);
memset(cpuhist, , outputNBytes);
memset(gpuhist, , outputNBytes); int i;
time_t ltime;
cl_uint a, b;
time(&ltime);
for (i = , a = b = (cl_uint)ltime; i < element; i++)// b 的低 16 位乘 a 再加上 b 的高 16 位,赋给新的 b
input[i] = (b = (a * (b & )) + (b >> )); cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputByte, input, &status);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, outputNBytes, gpuhist, &status); clSetKernelArg(histogram, , sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(histogram, , sizeof(cl_mem), (void *)&outputBuffer);
clSetKernelArg(histogram, , sizeof(cl_uint), (void *)&element4PerThread);
clSetKernelArg(reduce, , sizeof(cl_mem), (void *)&outputBuffer);
clSetKernelArg(reduce, , sizeof(cl_uint), (void *)&nGroups); size_t globalSizeHist = nThreads, localSizeHist = nThreadsPerGroup;
size_t globalSizeReduce = SIZE_OF_BIN, localSizeReduce = ; clEnqueueNDRangeKernel(queue, histogram, , NULL, &globalSizeHist, &localSizeHist, , NULL, NULL);
clEnqueueNDRangeKernel(queue, reduce, , NULL, &globalSizeReduce, &localSizeReduce, , NULL, NULL);
clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, , outputNBytes / nGroups, gpuhist, , NULL, NULL); for (i = , input; i < element; i++) // 使用 CPU 计算,注意每 8 位看做一个元素,取值范围 0 ~ 255
{
cpuhist[(input[i] >> ) & 0xff]++;
cpuhist[(input[i] >> ) & 0xff]++;
cpuhist[(input[i] >> ) & 0xff]++;
cpuhist[(input[i] >> ) & 0xff]++;
} int countError;
for (i = countError = ; i < SIZE_OF_BIN; i++)// 检查结果
{
if (gpuhist[i] != cpuhist[i])
countError++;
}
printf("\n<main> CPU, GPU %s.\n\n\n", countError ? "mismatched" : "matched"); free(listPlatform);
free(listDevice);
free(input);
free(cpuhist);
free(gpuhist);
clReleaseContext(context);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseKernel(histogram);
clReleaseKernel(reduce);
clReleaseMemObject(inputBuffer);
clReleaseMemObject(outputBuffer);
getchar();
return ;
}

● 输出结果

<main> CPU, GPU matched.

OpenCL 直方图的更多相关文章

  1. 《OpenCL异构并行编程实战》补充笔记散点,第五至十二章

    ▶ 第五章,OpenCL 的并发与执行模型 ● 内存对象与上下文相关而不是与设备相关.设备在不同设备之间的移动如下,如果 kernel 在第二个设备上运行,那么在第一个设备上产生的任何数据结果在第二个 ...

  2. OpenCL与CUDA,CPU与GPU

    OpenCL OpenCL(全称Open Computing Language,开放运算语言)是第一个面向异构系统通用目的并行编程的开放式.免费标准,也是一个统一的编程环境,便于软件开发人员为高性能计 ...

  3. OpenCL C

    OpenCL C OpenCL  简介 opencl C是ISO C99的一个扩展,主要区别如下: 去除了C99的一些特性,如:标准C99头文件,函数指针,递归,变长数组,和位域 增加了一些特性用于并 ...

  4. Oracle索引梳理系列(十)- 直方图使用技巧及analyze table操作对直方图统计的影响(谨慎使用)

    版权声明:本文发布于http://www.cnblogs.com/yumiko/,版权由Yumiko_sunny所有,欢迎转载.转载时,请在文章明显位置注明原文链接.若在未经作者同意的情况下,将本文内 ...

  5. 任意半径局部直方图类算法在PC中快速实现的框架。

    在图像处理中,局部算法一般来说,在很大程度上会获得比全局算法更为好的效果,因为他考虑到了图像领域像素的信息,而很多局部算法可以借助于直方图获得加速.同时,一些常规的算法,比如中值滤波.最大值滤波.最小 ...

  6. [LeetCode] Largest Rectangle in Histogram 直方图中最大的矩形

    Given n non-negative integers representing the histogram's bar height where the width of each bar is ...

  7. 基于SoCkit的opencl实验1-基础例程

    基于SoCkit的opencl实验1-基础例程 准备软硬件 Arrow SoCkit Board 4GB or larger microSD Card Quartus II v14.1 SoCEDS ...

  8. opencv 比较直方图方式 进行人脸检测对比

    完整opencv(emgucv)人脸.检测.采集.识别.匹配.对比 //成对几何直方图匹配               public static string MatchHist()         ...

  9. OPenCL

    OpenCLhttp://baike.baidu.com/link?url=7uHWCVUYB3Sau_xh3OOKP-A08_IvmT1SJixdAXKezCuCfkzeSQDiSmesGyVGk8 ...

随机推荐

  1. MQ是什么 RabbitMQ

    一.rabbitMQ是什么: RabbitMQ,遵循AMQP协议,由内在高并发的Erlang语言开发,用在实时的对可靠性要求比较高的消息传递上. 学过websocket的来理解rabbitMQ应该是 ...

  2. ContentNegotiatingViewResolver多种输出格式实例: json/jsp/xml/xls/pdf

    ContentNegotiatingViewResolver多种输出格式实例: json/jsp/xml/xls/pdf 本例用的是javaConfig配置 以pizza为例. json输出需要用到的 ...

  3. ORACLE导入导出工具的使用

    ORACLE导出工具exp的使用:  1.将数据库TEST(远程的数据库必须为连接标志符)完全导出,用户名system,密码manager,导出到D:\daochu.dmp中:       exp s ...

  4. RAII资源获取初始化,删除器

    body, table{font-family: 微软雅黑; font-size: 10pt} table{border-collapse: collapse; border: solid gray; ...

  5. 用django发送异步邮件

    太阳底下没有新鲜事,github是一个神奇的地方,你有什么想法,需求,点子.其实别人早就想到,而且也已经做到. 所以不要高估自己,有什么想法还是GITHUB一下,免得成了井底之娃. 这几天一直在研究p ...

  6. C#如何弹出输入框

    在C#中,进行windows窗体应用程序编程的时候,经常需要弹出输入框,输入密码,输入文本之类的.然而,C#中没有直接弹出输入框的语句,MessageBox只能显示一段消息而不能输入.我们需要调用Mi ...

  7. 中兴u880e精简教程

    精简软件请参考此处 (A代表可以删除,B代表建议别删除.)删或留你做主. Accounts AndSyncSettings.apk 账户与同步设置 A alarming.apk 闹钟时钟 A Appl ...

  8. MyEclipse10 中设置Jquery提醒,亲测可用

    最近做练习需要用到Jquery,在myeclipse中默认没有提示功能.然后在网上找解决方案,有一种方案说使用spket,然后搜索安装,折腾了半天还是不行,脑细胞死掉几百个.. 然后在网上搜到另外一种 ...

  9. 公式中表达单个双引号【"】和空值【""】的方法及说明

    http://club.excelhome.net/thread-661904-1-1.html 有人问为什么不用三个双引号"""来表示单个双引号["]呢,如果 ...

  10. linux find查找并拷贝 exec xargs区别

    -exec    1.参数是一个一个传递的,传递一个参数执行一次rm    2.文件名有空格等特殊字符也能处理-xargs     1.一次将参数传给命令,可以使用-n控制参数个数    2.处理特殊 ...