▶ 计算直方图,由原子计数和规约计算两部分组成

● 最简单的版本,代码

 // kernel.cl
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable #define N_BANK 16 // 缓存 bank 数
#define SIZE_OF_BIN 256 // 256 个桶
#define BIT_OF_ELEMENT 8 // 单元素为 8 位,取值范围 0 ~ 255
#define MIN(a,b) ((a) < (b)) ? (a) : (b)
#define MAX(a,b) ((a) > (b)) ? (a) : (b) __kernel __attribute__((reqd_work_group_size(SIZE_OF_BIN, , ))) // 工作组尺寸对齐到桶的个数的整数倍
void histogramKernel(__global uint4 *input, __global uint *Histogram, uint element4PerThread)
{
__local uint subhists[N_BANK * SIZE_OF_BIN];
const uint gid = get_global_id(), lid = get_local_id(), stride = get_global_size();
uint i, idx, lmem_items = N_BANK * SIZE_OF_BIN, lmem_items_per_thread, lmem_max_threads, bin;
uint4 temp, temp2; lmem_max_threads = MAX(, get_local_size() / lmem_items); // 计算局部内存中每个工作项对应的线程数,至少为 1,后面几行没看懂,不改
lmem_max_threads = MAX(, lmem_max_threads / lmem_items); // but no more than we have items
lmem_max_threads = lmem_items / lmem_max_threads; // calculate threads total
lmem_max_threads = MIN(get_local_size(), lmem_max_threads);// but no more than LDS banks
lmem_items_per_thread = lmem_items / lmem_max_threads; if (lid < lmem_max_threads)// 初始化桶
for (i = , idx = lid; i < lmem_items_per_thread / ; subhists[idx] = , i++, idx += lmem_max_threads);
barrier(CLK_LOCAL_MEM_FENCE); for (i = , idx = gid; i < element4PerThread; i++, idx += stride)// 原子计数部分
{
temp = input[idx];
temp2 = (temp & (SIZE_OF_BIN - )) * N_BANK + lid % N_BANK;// 取 input[idx] 的低 8 位(4个 0 ~ 255 的数),对准缓存行位置 (void)atom_inc(subhists + temp2.x);
(void)atom_inc(subhists + temp2.y);
(void)atom_inc(subhists + temp2.z);
(void)atom_inc(subhists + temp2.w); temp = temp >> BIT_OF_ELEMENT;
temp2 = (temp & (SIZE_OF_BIN - )) * N_BANK + lid % N_BANK;// 取 input[idx] 的次低 8 位 (void)atom_inc(subhists + temp2.x);
(void)atom_inc(subhists + temp2.y);
(void)atom_inc(subhists + temp2.z);
(void)atom_inc(subhists + temp2.w); temp = temp >> BIT_OF_ELEMENT;
temp2 = (temp & (SIZE_OF_BIN - )) * N_BANK + lid % N_BANK; (void)atom_inc(subhists + temp2.x);
(void)atom_inc(subhists + temp2.y);
(void)atom_inc(subhists + temp2.z);
(void)atom_inc(subhists + temp2.w); temp = temp >> BIT_OF_ELEMENT;
temp2 = (temp & (SIZE_OF_BIN - )) * N_BANK + lid % N_BANK; (void)atom_inc(subhists + temp2.x);
(void)atom_inc(subhists + temp2.y);
(void)atom_inc(subhists + temp2.z);
(void)atom_inc(subhists + temp2.w);
}
barrier(CLK_LOCAL_MEM_FENCE); if (lid < SIZE_OF_BIN)// 规约部分
{
for (i = , bin = ; i<N_BANK; bin += subhists[(lid * N_BANK) + i], i++);// 每个线程负责连续的 N_BANK 个数据,加到一起写入 Histogram
Histogram[(get_group_id() * SIZE_OF_BIN) + lid] = bin;
}
} __kernel void reduceKernel(__global uint *Histogram, uint nSubHists)
{
const uint gid = get_global_id();
int i;
uint bin;
for (i = , bin = ; i < nSubHists; bin += Histogram[(i * SIZE_OF_BIN) + gid], i++);// 每个工作项负责间隔为 SIZE_OF_BIN 的项进行求和
Histogram[gid] = bin; // 结果放入原直方图的头部
}
 // main.c
#include <cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <time.h>

#define SIZE_OF_BIN 256

char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl";

/* Print `message`, wait for a key press so the console stays open, then exit with -1. */
static void die(const char *message)
{
    printf("%s", message);
    getchar();
    exit(-1);
}

/* Abort with a diagnostic naming `what` when an OpenCL call did not return CL_SUCCESS. */
static void checkStatus(cl_int status, const char *what)
{
    if (status != CL_SUCCESS)
    {
        printf("<main> %s failed, status = %d\n", what, (int)status);
        getchar();
        exit(-1);
    }
}

/*
 * readText - load a whole text file into a freshly malloc'ed, NUL-terminated buffer.
 *
 * kernelPath  path of the file to read.
 * pcode       receives the buffer; the caller owns it and must free() it.
 *
 * Returns the number of bytes stored in *pcode including the terminating NUL
 * (file size + 1). NOTE(review): the literal in the original `return size + ...`
 * was lost; + 1 is assumed. Any failure prints a message and exits the process.
 */
int readText(const char* kernelPath, char **pcode)
{
    FILE *fp = NULL;
    long size;

    //printf("<readText> File: %s\n", kernelPath);
    fopen_s(&fp, kernelPath, "rb");
    if (!fp)
        die("<readText> Open file failed\n");
    if (fseek(fp, 0, SEEK_END) != 0)
        die("<readText> Seek end of file failed\n");
    if ((size = ftell(fp)) < 0)
        die("<readText> Get file position failed\n");
    rewind(fp);
    if ((*pcode = (char *)malloc((size_t)size + 1)) == NULL)
        die("<readText> Allocate space failed\n");
    if (fread(*pcode, 1, (size_t)size, fp) != (size_t)size)
        die("<readText> Read file failed\n");
    (*pcode)[size] = '\0';
    fclose(fp);
    return (int)size + 1;
}

/*
 * Builds kernel.cl, fills a buffer with pseudo-random words, computes the
 * 256-bin byte histogram on the OpenCL device (histogramKernel followed by
 * reduceKernel) and on the CPU, and reports whether the two agree.
 */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    // Ask the runtime to dump the kernel in il and isa form into the program directory (useful for isa-level debugging).
    _putenv("GPU_DUMP_DEVICE_KERNEL=3");

    cl_int status;

    // Platform / device / context / queue: first platform, first device.
    cl_uint nPlatform = 0;
    status = clGetPlatformIDs(0, NULL, &nPlatform);
    checkStatus(status, "clGetPlatformIDs");
    if (nPlatform == 0)
        die("<main> No OpenCL platform found\n");
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    if (listPlatform == NULL)
        die("<main> Allocate space failed\n");
    status = clGetPlatformIDs(nPlatform, listPlatform, NULL);
    checkStatus(status, "clGetPlatformIDs");

    cl_uint nDevice = 0;
    status = clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
    checkStatus(status, "clGetDeviceIDs");
    if (nDevice == 0)
        die("<main> No OpenCL device found\n");
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    if (listDevice == NULL)
        die("<main> Allocate space failed\n");
    status = clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
    checkStatus(status, "clGetDeviceIDs");

    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    checkStatus(status, "clCreateContext");
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
    checkStatus(status, "clCreateCommandQueue");

    // Program and kernels.
    char *code;
    // readText counts the terminating NUL; the length handed to OpenCL must not.
    size_t length = (size_t)readText(sourceText, &code) - 1;
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, &status);
    checkStatus(status, "clCreateProgramWithSource");
    status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        // Show the compiler output before giving up.
        size_t logSize = 0;
        clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        char *buildLog = (char *)malloc(logSize + 1);
        if (buildLog != NULL)
        {
            clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, logSize, buildLog, NULL);
            buildLog[logSize] = '\0';
            printf("<main> Build log:\n%s\n", buildLog);
            free(buildLog);
        }
        checkStatus(status, "clBuildProgram");
    }
    cl_kernel histogram = clCreateKernel(program, "histogramKernel", &status);
    checkStatus(status, "clCreateKernel(histogramKernel)");
    cl_kernel reduce = clCreateKernel(program, "reduceKernel", &status);
    checkStatus(status, "clCreateKernel(reduceKernel)");

    // Problem size.
    cl_uint nThreads = 64 * 1024;
    // Must equal the reqd_work_group_size of histogramKernel (original value: KernelCompileWorkGroupSize[0]).
    cl_uint nThreadsPerGroup = SIZE_OF_BIN;
    cl_uint nGroups = nThreads / nThreadsPerGroup;

    // NOTE(review): the original literal was lost (its comment only says the value came from
    // DeviceMaxMemAllocSize == 2^31). 64 MiB is an assumption; any multiple of
    // nThreads * sizeof(cl_uint4) that the device can allocate works.
    cl_uint inputByte = 64u * 1024u * 1024u;
    cl_uint outputNBytes = nGroups * SIZE_OF_BIN * sizeof(cl_uint);
    cl_uint element = inputByte / sizeof(cl_uint);
    cl_uint element4 = inputByte / sizeof(cl_uint4);
    cl_uint element4PerThread = element4 / nThreads;

    // calloc gives zero-filled buffers, replacing the malloc + memset pairs.
    unsigned int *input = (unsigned int*)calloc(element, sizeof(unsigned int));
    unsigned int *cpuhist = (unsigned int*)calloc(nGroups * SIZE_OF_BIN, sizeof(unsigned int));
    unsigned int *gpuhist = (unsigned int*)calloc(nGroups * SIZE_OF_BIN, sizeof(unsigned int));
    if (input == NULL || cpuhist == NULL || gpuhist == NULL)
        die("<main> Allocate space failed\n");

    // Pseudo-random input: new b = a * (low 16 bits of b) + (high 16 bits of b).
    cl_uint i;
    time_t ltime;
    cl_uint a, b;
    time(&ltime);
    for (i = 0, a = b = (cl_uint)ltime; i < element; i++)
        input[i] = (b = (a * (b & 65535)) + (b >> 16));

    // Device buffers. The output buffer is written by histogramKernel and then
    // read and rewritten by reduceKernel, so it must be READ_WRITE.
    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputByte, input, &status);
    checkStatus(status, "clCreateBuffer(input)");
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, outputNBytes, gpuhist, &status);
    checkStatus(status, "clCreateBuffer(output)");

    checkStatus(clSetKernelArg(histogram, 0, sizeof(cl_mem), (void *)&inputBuffer), "clSetKernelArg(histogram, 0)");
    checkStatus(clSetKernelArg(histogram, 1, sizeof(cl_mem), (void *)&outputBuffer), "clSetKernelArg(histogram, 1)");
    checkStatus(clSetKernelArg(histogram, 2, sizeof(cl_uint), (void *)&element4PerThread), "clSetKernelArg(histogram, 2)");
    checkStatus(clSetKernelArg(reduce, 0, sizeof(cl_mem), (void *)&outputBuffer), "clSetKernelArg(reduce, 0)");
    checkStatus(clSetKernelArg(reduce, 1, sizeof(cl_uint), (void *)&nGroups), "clSetKernelArg(reduce, 1)");

    size_t globalSizeHist = nThreads, localSizeHist = nThreadsPerGroup;
    // NOTE(review): the original local size for the reduce launch was lost; one group of
    // SIZE_OF_BIN work-items is assumed (it must divide globalSizeReduce).
    size_t globalSizeReduce = SIZE_OF_BIN, localSizeReduce = SIZE_OF_BIN;

    status = clEnqueueNDRangeKernel(queue, histogram, 1, NULL, &globalSizeHist, &localSizeHist, 0, NULL, NULL);
    checkStatus(status, "clEnqueueNDRangeKernel(histogram)");
    status = clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &globalSizeReduce, &localSizeReduce, 0, NULL, NULL);
    checkStatus(status, "clEnqueueNDRangeKernel(reduce)");
    // Blocking read of the first SIZE_OF_BIN counters only: that is where reduceKernel leaves the result.
    status = clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, outputNBytes / nGroups, gpuhist, 0, NULL, NULL);
    checkStatus(status, "clEnqueueReadBuffer");

    // CPU reference: every 8 bits of each word is one element in 0 ~ 255.
    for (i = 0; i < element; i++)
    {
        cpuhist[(input[i] >> 0) & 0xff]++;
        cpuhist[(input[i] >> 8) & 0xff]++;
        cpuhist[(input[i] >> 16) & 0xff]++;
        cpuhist[(input[i] >> 24) & 0xff]++;
    }

    // Compare the two histograms.
    int countError = 0;
    for (i = 0; i < SIZE_OF_BIN; i++)
    {
        if (gpuhist[i] != cpuhist[i])
            countError++;
    }
    printf("\n<main> CPU, GPU %s.\n\n\n", countError ? "mismatched" : "matched");

    // Host memory.
    free(listPlatform);
    free(listDevice);
    free(code);
    free(input);
    free(cpuhist);
    free(gpuhist);
    // OpenCL objects, dependants before the context they belong to.
    clReleaseKernel(histogram);
    clReleaseKernel(reduce);
    clReleaseProgram(program);
    clReleaseMemObject(inputBuffer);
    clReleaseMemObject(outputBuffer);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    getchar();
    return 0;
}

● 输出结果

<main> CPU, GPU matched.

OpenCL 直方图的更多相关文章

  1. 《OpenCL异构并行编程实战》补充笔记散点,第五至十二章

    ▶ 第五章,OpenCL 的并发与执行模型 ● 内存对象与上下文相关而不是与设备相关.设备在不同设备之间的移动如下,如果 kernel 在第二个设备上运行,那么在第一个设备上产生的任何数据结果在第二个 ...

  2. OpenCL与CUDA,CPU与GPU

    OpenCL OpenCL(全称Open Computing Language,开放运算语言)是第一个面向异构系统通用目的并行编程的开放式.免费标准,也是一个统一的编程环境,便于软件开发人员为高性能计 ...

  3. OpenCL C

    OpenCL C OpenCL  简介 opencl C是ISO C99的一个扩展,主要区别如下: 去除了C99的一些特性,如:标准C99头文件,函数指针,递归,变长数组,和位域 增加了一些特性用于并 ...

  4. Oracle索引梳理系列(十)- 直方图使用技巧及analyze table操作对直方图统计的影响(谨慎使用)

    版权声明:本文发布于http://www.cnblogs.com/yumiko/,版权由Yumiko_sunny所有,欢迎转载.转载时,请在文章明显位置注明原文链接.若在未经作者同意的情况下,将本文内 ...

  5. 任意半径局部直方图类算法在PC中快速实现的框架。

    在图像处理中,局部算法一般来说,在很大程度上会获得比全局算法更为好的效果,因为他考虑到了图像领域像素的信息,而很多局部算法可以借助于直方图获得加速.同时,一些常规的算法,比如中值滤波.最大值滤波.最小 ...

  6. [LeetCode] Largest Rectangle in Histogram 直方图中最大的矩形

    Given n non-negative integers representing the histogram's bar height where the width of each bar is ...

  7. 基于SoCkit的opencl实验1-基础例程

    基于SoCkit的opencl实验1-基础例程 准备软硬件 Arrow SoCkit Board 4GB or larger microSD Card Quartus II v14.1 SoCEDS ...

  8. opencv 比较直方图方式 进行人脸检测对比

    完整opencv(emgucv)人脸.检测.采集.识别.匹配.对比 //成对几何直方图匹配               public static string MatchHist()         ...

  9. OPenCL

    OpenCLhttp://baike.baidu.com/link?url=7uHWCVUYB3Sau_xh3OOKP-A08_IvmT1SJixdAXKezCuCfkzeSQDiSmesGyVGk8 ...

随机推荐

  1. Python - Learn Note (2)

    Python注释 Python的注释以#开头,后面的文字直到行尾都算注释 Python基本数据类型 整数.浮点数(浮点数也就是小数,之所以称为浮点数,是因为按照科学记数法表示时,一个浮点数的小数点位置 ...

  2. 硬盘安装CentOS 6.0(超级详细图文教程)

    硬盘安装CentOS 6.0(超级详细图文教程) 来源:   引言: 电脑系统是Windows XP,电脑没有光驱.手头没有U盘.没有移动硬盘.电脑主板不支持U盘启动,在这种情况下想安装CentOS ...

  3. 『转』Dr.Web Security Space 8 – 免费3个月

    简短的测试五个问题,任意回答问题,都将获得Dr.Web Security Suite 3个月免费许可证以及大蜘蛛企业安全套件2个月来保护整个公司!活动地址:https://www.drweb.com/ ...

  4. 《转》深入理解Activity启动流程(二)–Activity启动相关类的类图

    本文原创作者:Cloud Chou. 出处:本文链接 本系列博客将详细阐述Activity的启动流程,这些博客基于Cm 10.1源码研究. 在介绍Activity的详细启动流程之前,先为大家介绍Act ...

  5. 理解HTTP之Content-Type

    http://homeway.me/2015/07/19/understand-http-about-content-type/

  6. 把字符串中的空格替换为"%20"

    这个需要注意的是字符串的结尾最后一个字符为'\0',并不是空字符,复制时要一块复制,算法思想就是先计算出字符串中总的空格数,然后 重新计算字符串的长度,由于"%20"为3个字符,比 ...

  7. python3.5 安装 numpy1.14.4

    AMD64 import pip._internal print(pip._internal.pep425tags.get_supported()) WIN32 import pip print(pi ...

  8. 白帽子讲web安全——一个安全解决方案的诞生细节

    1.白帽子:做安全的人.主要做的事,防御,是制定一套解决攻击的方案.而不是只是解决某个漏洞. 2.黑帽子:现在说的黑客.让web变的不安全的人.利用漏洞获取特权.主要做的事,攻击,组合各种方法利用漏洞 ...

  9. Mac: iTerm2使用

    From: http://www.cnblogs.com/noTice520/p/3190529.html 之前一直有朋友要我分享下在用的mac软件,今天有空就来写一下,可能不止于软件,会有一些配置或 ...

  10. System.IO.Path类

    System.IO.Path为路径的操作封装了很多很有的东西,利用该类提供的方法能够快速处理路径操作的问题.下面详细了解一下. 1.属性 属性太复杂了,反映什么系统平台的信息,看不懂,等以后看得懂了再 ...