OpenCL 直方图
▶ 计算直方图,由原子计数和归约计算两部分组成
● 最简单的版本,代码
// kernel.cl
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

#define N_BANK          16  // number of cache banks; every bin owns N_BANK separate counters
#define SIZE_OF_BIN     256 // 256 bins
#define BIT_OF_ELEMENT  8   // one element is 8 bits wide, value range 0 ~ 255
// Fully parenthesised so the macros stay correct inside larger expressions.
#define MIN(a,b)        (((a) < (b)) ? (a) : (b))
#define MAX(a,b)        (((a) > (b)) ? (a) : (b))

/*
 * histogramKernel: builds one partial 256-bin histogram per work-group.
 *
 *   input             packed data; every uint4 carries 16 elements of 8 bits each
 *   Histogram         output; work-group g writes its SIZE_OF_BIN counters
 *                     starting at Histogram[g * SIZE_OF_BIN]
 *   element4PerThread number of uint4 values each work-item processes
 *
 * Work-items walk the input with a step of get_global_size(0), count into
 * local memory with atomic increments, and finally each work-item sums the
 * N_BANK counters of one bin and stores the total.
 * The work-group size is fixed to SIZE_OF_BIN so that there is exactly one
 * work-item per bin in the final summation.
 */
__kernel __attribute__((reqd_work_group_size(SIZE_OF_BIN, 1, 1)))
void histogramKernel(__global uint4 *input, __global uint *Histogram, uint element4PerThread)
{
    // Counter for (bin, bank) lives at subhists[bin * N_BANK + bank].
    __local uint subhists[N_BANK * SIZE_OF_BIN];
    const uint gid = get_global_id(0), lid = get_local_id(0), stride = get_global_size(0);
    const uint bank = lid % N_BANK;                 // counter column used by this work-item
    uint i, idx, lmem_items = N_BANK * SIZE_OF_BIN, lmem_items_per_thread, lmem_max_threads, bin;
    uint4 temp, temp2;

    // Decide how many work-items take part in clearing local memory.
    lmem_max_threads = MAX(1, get_local_size(0) / lmem_items);      // work-items per local-memory item, at least 1
    lmem_max_threads = MAX(1, lmem_max_threads / lmem_items);       // but no more than we have items
    lmem_max_threads = lmem_items / lmem_max_threads;               // calculate threads total
    lmem_max_threads = MIN(get_local_size(0), lmem_max_threads);    // but no more than LDS banks
    lmem_items_per_thread = lmem_items / lmem_max_threads;

    // Zero the counters.
    // NOTE(review): the scraped source divided the trip count by a value that
    // was lost in extraction. Each store below clears exactly one uint, so any
    // divisor > 1 would leave part of subhists uninitialised; no divisor is
    // used here so that all lmem_items entries are cleared.
    if (lid < lmem_max_threads)
    {
        for (i = 0, idx = lid; i < lmem_items_per_thread; i++, idx += lmem_max_threads)
            subhists[idx] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Atomic counting part.
    for (i = 0, idx = gid; i < element4PerThread; i++, idx += stride)
    {
        temp = input[idx];
        // A 32-bit lane holds 32 / BIT_OF_ELEMENT elements; handle the lowest
        // one of each of the four lanes, then shift the next one down.
        for (uint part = 0; part < 32 / BIT_OF_ELEMENT; part++)
        {
            temp2 = (temp & (SIZE_OF_BIN - 1)) * N_BANK + bank;
            (void)atom_inc(subhists + temp2.x);
            (void)atom_inc(subhists + temp2.y);
            (void)atom_inc(subhists + temp2.z);
            (void)atom_inc(subhists + temp2.w);
            temp = temp >> BIT_OF_ELEMENT;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduction part: each work-item adds up the N_BANK consecutive counters
    // of bin number lid and writes the sum to this group's slice of Histogram.
    if (lid < SIZE_OF_BIN)
    {
        for (i = 0, bin = 0; i < N_BANK; i++)
            bin += subhists[(lid * N_BANK) + i];
        Histogram[(get_group_id(0) * SIZE_OF_BIN) + lid] = bin;
    }
}

/*
 * reduceKernel: folds nSubHists partial histograms into one.
 *
 *   Histogram  nSubHists consecutive histograms of SIZE_OF_BIN counters each
 *   nSubHists  number of partial histograms stored in Histogram
 *
 * Work-item gid sums the entries gid, gid + SIZE_OF_BIN, gid + 2 * SIZE_OF_BIN, ...
 * and stores the total in Histogram[gid], i.e. the result overwrites the
 * first partial histogram. Each work-item only touches its own bin index.
 */
__kernel void reduceKernel(__global uint *Histogram, uint nSubHists)
{
    const uint gid = get_global_id(0);
    uint i;
    uint bin;
    for (i = 0, bin = 0; i < nSubHists; i++)
        bin += Histogram[(i * SIZE_OF_BIN) + gid];
    Histogram[gid] = bin;   // result goes to the head of the original histogram
}
// main.c
#include <cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <time.h>

#define SIZE_OF_BIN 256

char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl";

/*
 * Reads the whole file kernelPath into a freshly malloc'ed, NUL-terminated
 * buffer stored in *pcode (the caller frees it).
 * Returns the number of bytes in the buffer INCLUDING the terminating NUL,
 * i.e. file size + 1. On any failure a message is printed and the process
 * exits with status -1.
 */
int readText(const char* kernelPath, char **pcode)
{
    FILE *fp;
    int size;
    //printf("<readText> File: %s\n", kernelPath);
    fopen_s(&fp, kernelPath, "rb");
    if (!fp)
    {
        printf("<readText> Open file failed\n");
        getchar();
        exit(-1);
    }
    if (fseek(fp, 0, SEEK_END) != 0)
    {
        printf("<readText> Seek end of file failed\n");
        getchar();
        exit(-1);
    }
    if ((size = ftell(fp)) < 0)
    {
        printf("<readText> Get file position failed\n");
        getchar();
        exit(-1);
    }
    rewind(fp);
    if ((*pcode = (char *)malloc(size + 1)) == NULL)
    {
        printf("<readText> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    // A short read would leave garbage in the buffer handed to the compiler.
    if (fread(*pcode, 1, size, fp) != (size_t)size)
    {
        printf("<readText> Read file failed\n");
        getchar();
        exit(-1);
    }
    (*pcode)[size] = '\0';
    fclose(fp);
    return size + 1;
}

/* Prints a message and exits with status -1 when an OpenCL call did not return CL_SUCCESS. */
static void checkCL(cl_int status, const char *what)
{
    if (status != CL_SUCCESS)
    {
        printf("<main> %s failed, status = %d\n", what, (int)status);
        getchar();
        exit(-1);
    }
}

/*
 * Fills a buffer with pseudo-random data, computes its 256-bin byte histogram
 * on the first OpenCL device (histogramKernel followed by reduceKernel) and on
 * the CPU, and reports whether the two results agree.
 */
int main(int argc, char * argv[])
{
    // Dumps the kernel in il and isa form into the program directory, so it can be debugged at isa level
    _putenv("GPU_DUMP_DEVICE_KERNEL=3");

    cl_int status;
    cl_uint nPlatform = 0;
    checkCL(clGetPlatformIDs(0, NULL, &nPlatform), "clGetPlatformIDs");
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    checkCL(clGetPlatformIDs(nPlatform, listPlatform, NULL), "clGetPlatformIDs");
    cl_uint nDevice = 0;
    checkCL(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice), "clGetDeviceIDs");
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    checkCL(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL), "clGetDeviceIDs");
    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    checkCL(status, "clCreateContext");
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
    checkCL(status, "clCreateCommandQueue");

    char *code;
    // readText counts the terminating NUL; the length handed to OpenCL must not.
    size_t length = (size_t)readText(sourceText, &code) - 1;
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, &status);
    checkCL(status, "clCreateProgramWithSource");
    status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        // Show the compiler output before giving up; without it a build error is undiagnosable.
        size_t logSize = 0;
        clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        char *log = (char *)malloc(logSize + 1);
        if (log != NULL)
        {
            clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
            log[logSize] = '\0';
            printf("%s\n", log);
            free(log);
        }
        checkCL(status, "clBuildProgram");
    }
    cl_kernel histogram = clCreateKernel(program, "histogramKernel", &status);
    checkCL(status, "clCreateKernel(histogramKernel)");
    cl_kernel reduce = clCreateKernel(program, "reduceKernel", &status);
    checkCL(status, "clCreateKernel(reduceKernel)");

    cl_uint nThreads = 64 * 1024;
    cl_uint nThreadsPerGroup = SIZE_OF_BIN;     // must equal the kernel's reqd_work_group_size; original value: KernelCompileWorkGroupSize[0]
    cl_uint nGroups = nThreads / nThreadsPerGroup;

    // NOTE(review): the author's value was lost when the page was scraped (the
    // sample originally used DeviceMaxMemAllocSize == 2^31). 64 MiB is a
    // placeholder; it must be a multiple of nThreads * sizeof(cl_uint4) so that
    // element4PerThread covers the whole input.
    cl_uint inputByte = 64u * 1024u * 1024u;
    cl_uint outputNBytes = nGroups * SIZE_OF_BIN * sizeof(cl_uint);
    cl_uint element = inputByte / sizeof(cl_uint);
    cl_uint element4 = inputByte / sizeof(cl_uint4);
    cl_uint element4PerThread = element4 / nThreads;

    unsigned int *input = (unsigned int*)malloc(inputByte);
    unsigned int *cpuhist = (unsigned int*)malloc(outputNBytes);
    unsigned int *gpuhist = (unsigned int*)malloc(outputNBytes);
    if (input == NULL || cpuhist == NULL || gpuhist == NULL)
    {
        printf("<main> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    memset(input, 0, inputByte);
    memset(cpuhist, 0, outputNBytes);
    memset(gpuhist, 0, outputNBytes);

    cl_uint i;
    time_t ltime;
    cl_uint a, b;
    time(&ltime);
    // Multiply the low 16 bits of b by a, add the high 16 bits of b, store as the new b
    for (i = 0, a = b = (cl_uint)ltime; i < element; i++)
        input[i] = (b = (a * (b & 65535)) + (b >> 16));

    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputByte, input, &status);
    checkCL(status, "clCreateBuffer(input)");
    // reduceKernel reads the partial histograms back from this buffer, so it
    // has to be readable by kernels as well as writable.
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, outputNBytes, gpuhist, &status);
    checkCL(status, "clCreateBuffer(output)");

    checkCL(clSetKernelArg(histogram, 0, sizeof(cl_mem), (void *)&inputBuffer), "clSetKernelArg");
    checkCL(clSetKernelArg(histogram, 1, sizeof(cl_mem), (void *)&outputBuffer), "clSetKernelArg");
    checkCL(clSetKernelArg(histogram, 2, sizeof(cl_uint), (void *)&element4PerThread), "clSetKernelArg");
    checkCL(clSetKernelArg(reduce, 0, sizeof(cl_mem), (void *)&outputBuffer), "clSetKernelArg");
    checkCL(clSetKernelArg(reduce, 1, sizeof(cl_uint), (void *)&nGroups), "clSetKernelArg");

    size_t globalSizeHist = nThreads, localSizeHist = nThreadsPerGroup;
    // NOTE(review): the reduce work-group size was lost in extraction; 64 is a
    // placeholder that divides SIZE_OF_BIN. reduceKernel does not depend on it.
    size_t globalSizeReduce = SIZE_OF_BIN, localSizeReduce = 64;

    checkCL(clEnqueueNDRangeKernel(queue, histogram, 1, NULL, &globalSizeHist, &localSizeHist, 0, NULL, NULL), "clEnqueueNDRangeKernel(histogram)");
    checkCL(clEnqueueNDRangeKernel(queue, reduce, 1, NULL, &globalSizeReduce, &localSizeReduce, 0, NULL, NULL), "clEnqueueNDRangeKernel(reduce)");
    // Only the first SIZE_OF_BIN counters (the reduced histogram) are read back.
    checkCL(clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, outputNBytes / nGroups, gpuhist, 0, NULL, NULL), "clEnqueueReadBuffer");

    // CPU reference; every 8 bits are treated as one element with value range 0 ~ 255
    for (i = 0; i < element; i++)
    {
        cpuhist[(input[i] >> 0) & 0xff]++;
        cpuhist[(input[i] >> 8) & 0xff]++;
        cpuhist[(input[i] >> 16) & 0xff]++;
        cpuhist[(input[i] >> 24) & 0xff]++;
    }

    // Check the result
    int countError = 0;
    for (i = 0; i < SIZE_OF_BIN; i++)
    {
        if (gpuhist[i] != cpuhist[i])
            countError++;
    }
    printf("\n<main> CPU, GPU %s.\n\n\n", countError ? "mismatched" : "matched");

    free(listPlatform);
    free(listDevice);
    free(code);
    free(input);
    free(cpuhist);
    free(gpuhist);
    // Release children before the objects they were created from.
    clReleaseKernel(histogram);
    clReleaseKernel(reduce);
    clReleaseProgram(program);
    clReleaseMemObject(inputBuffer);
    clReleaseMemObject(outputBuffer);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    getchar();
    return 0;
}
● 输出结果
<main> CPU, GPU matched.
OpenCL 直方图的更多相关文章
- 《OpenCL异构并行编程实战》补充笔记散点,第五至十二章
▶ 第五章,OpenCL 的并发与执行模型 ● 内存对象与上下文相关而不是与设备相关.设备在不同设备之间的移动如下,如果 kernel 在第二个设备上运行,那么在第一个设备上产生的任何数据结果在第二个 ...
- OpenCL与CUDA,CPU与GPU
OpenCL OpenCL(全称Open Computing Language,开放运算语言)是第一个面向异构系统通用目的并行编程的开放式.免费标准,也是一个统一的编程环境,便于软件开发人员为高性能计 ...
- OpenCL C
OpenCL C OpenCL 简介 opencl C是ISO C99的一个扩展,主要区别如下: 去除了C99的一些特性,如:标准C99头文件,函数指针,递归,变长数组,和位域 增加了一些特性用于并 ...
- Oracle索引梳理系列(十)- 直方图使用技巧及analyze table操作对直方图统计的影响(谨慎使用)
版权声明:本文发布于http://www.cnblogs.com/yumiko/,版权由Yumiko_sunny所有,欢迎转载.转载时,请在文章明显位置注明原文链接.若在未经作者同意的情况下,将本文内 ...
- 任意半径局部直方图类算法在PC中快速实现的框架。
在图像处理中,局部算法一般来说,在很大程度上会获得比全局算法更为好的效果,因为他考虑到了图像领域像素的信息,而很多局部算法可以借助于直方图获得加速.同时,一些常规的算法,比如中值滤波.最大值滤波.最小 ...
- [LeetCode] Largest Rectangle in Histogram 直方图中最大的矩形
Given n non-negative integers representing the histogram's bar height where the width of each bar is ...
- 基于SoCkit的opencl实验1-基础例程
基于SoCkit的opencl实验1-基础例程 准备软硬件 Arrow SoCkit Board 4GB or larger microSD Card Quartus II v14.1 SoCEDS ...
- opencv 比较直方图方式 进行人脸检测对比
完整opencv(emgucv)人脸.检测.采集.识别.匹配.对比 //成对几何直方图匹配 public static string MatchHist() ...
- OPenCL
OpenCLhttp://baike.baidu.com/link?url=7uHWCVUYB3Sau_xh3OOKP-A08_IvmT1SJixdAXKezCuCfkzeSQDiSmesGyVGk8 ...
随机推荐
- git add 的一点说明
git add --cached 这里 --cached是什么意思呢?要解释清楚这个问题,我们必须先了解一个文件在git中的状态. [commit]----[stage]-----[checkout] ...
- gitlab访问限制问题------Forbidden
解决方案: cd /etc/gitlab vim /gitlab.rb gitlab_rails['rack_attack_git_basic_auth'] = { 'enabled' => t ...
- supervisor进程管理工具
Supervisor 一个python写的进程管理工具,用来启动.关闭.重启进程,可以同时控制多个进程. 安装: pip install supervisor 配置: 通过配置文件来满足自己的需求 配 ...
- Presto改造
最近在打造一款可视化分析产品, 需要用到组合多数据源, 进行查询, 看了挺多开源的插件, 发现目前只有Presto比较符合, 但是由于Presto没有多用户机制和资源管理, 所以需要在这基本上构建多用 ...
- Leetcode 51
//看了一次解析后,一次AC,用一个pos记录行列.class Solution { public: vector<vector<string>> solveNQueens(i ...
- LeetCode 47
class Solution { public: vector<vector<int>> permuteUnique(vector<int>& nums) ...
- 在请求中使用XML Publisher生成文件报错
在页面上使用按钮生成该文件不报错,但是使用请求就报错. 错误内容如下 Error : No corresponding LOB data found :SELECT L.FILE_DATA FILE_ ...
- springboot模糊查询
在学习MyBatis过程中想实现模糊查询,可惜失败了.后来上百度上查了一下,算是解决了.记录一下MyBatis实现模糊查询的几种方式. 数据库表名为test_student,初始化了几条记录,如图: ...
- webpack 提升90%的构建速度 HardSourceWebpackPlugin
HardSourceWebpackPlugin 插件 不能提升第一次构建的速度,但对于第二次构建能提升99%的构建速度 第一次构建: 第二次: 提升了..,算不出来,反正就是很多啦~~~ npm in ...
- CF1082E:E.increasing Frequency(贪心&最大连续和)
You are given array a a of length n n . You can choose one segment [l,r] [l,r] (1≤l≤r≤n 1≤l≤r≤n ) an ...