▶ 计算直方图,由原子计数和规约计算两部分组成

● 最简单的版本,代码

 // kernel.cl
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

#define N_BANK 16        // number of banks: every bin keeps one counter per bank
#define SIZE_OF_BIN 256  // number of histogram bins
#define BIT_OF_ELEMENT 8 // one element is 8 bits wide, value range 0 ~ 255
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

/*
 * Builds one partial histogram per work-group.
 *
 * input             - packed data; every 32-bit component of a uint4 is
 *                     treated as four 8-bit elements.
 * Histogram         - receives SIZE_OF_BIN counters per work-group, stored at
 *                     offset get_group_id(0) * SIZE_OF_BIN.
 * element4PerThread - number of uint4 vectors each work-item reads; successive
 *                     reads are get_global_size(0) vectors apart.
 *
 * Local counters are laid out as subhists[value * N_BANK + bank], where
 * bank = local id % N_BANK, so neighbouring work-items that hit the same bin
 * increment different counters.
 */
__kernel __attribute__((reqd_work_group_size(SIZE_OF_BIN, 1, 1))) // work-group size is tied to the bin count
void histogramKernel(__global uint4 *input, __global uint *Histogram, uint element4PerThread)
{
    __local uint subhists[N_BANK * SIZE_OF_BIN];
    const uint gid = get_global_id(0);
    const uint lid = get_local_id(0);
    const uint stride = get_global_size(0);
    const uint lmem_items = N_BANK * SIZE_OF_BIN;
    const uint mask = SIZE_OF_BIN - 1;           // keeps the low BIT_OF_ELEMENT bits
    const uint bank = lid % N_BANK;
    const uint rounds = 32 / BIT_OF_ELEMENT;     // 8-bit elements per 32-bit component
    uint i, idx, round;
    uint lmem_items_per_thread, lmem_max_threads;

    // Decide how many work-items take part in clearing local memory.
    // With SIZE_OF_BIN work-items and N_BANK * SIZE_OF_BIN counters this
    // evaluates to SIZE_OF_BIN work-items clearing N_BANK counters each.
    lmem_max_threads = MAX(1, get_local_size(0) / lmem_items);   // work-items per counter, at least 1
    lmem_max_threads = MAX(1, lmem_max_threads / lmem_items);    // but no more than we have items
    lmem_max_threads = lmem_items / lmem_max_threads;            // calculate threads total
    lmem_max_threads = MIN(get_local_size(0), lmem_max_threads); // but no more than the work-group has
    lmem_items_per_thread = lmem_items / lmem_max_threads;

    // Zero every counter; each participating work-item clears a strided subset.
    if (lid < lmem_max_threads)
    {
        for (i = 0, idx = lid; i < lmem_items_per_thread; i++, idx += lmem_max_threads)
            subhists[idx] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Atomic counting phase.
    for (i = 0, idx = gid; i < element4PerThread; i++, idx += stride)
    {
        uint4 packed = input[idx];
        for (round = 0; round < rounds; round++)
        {
            // Lowest remaining 8 bits of each component, mapped to this work-item's bank.
            const uint4 slot = (packed & mask) * (uint)N_BANK + bank;
            (void)atom_inc(subhists + slot.x);
            (void)atom_inc(subhists + slot.y);
            (void)atom_inc(subhists + slot.z);
            (void)atom_inc(subhists + slot.w);
            packed = packed >> BIT_OF_ELEMENT;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduction phase: work-item `lid` sums the N_BANK consecutive counters
    // of bin `lid` and writes the total to this work-group's slice.
    if (lid < SIZE_OF_BIN)
    {
        uint bin = 0;
        for (i = 0; i < N_BANK; i++)
            bin += subhists[(lid * N_BANK) + i];
        Histogram[(get_group_id(0) * SIZE_OF_BIN) + lid] = bin;
    }
}

/*
 * Sums the partial histograms produced by histogramKernel.
 *
 * Histogram - nSubHists consecutive partial histograms of SIZE_OF_BIN counters.
 * nSubHists - number of partial histograms.
 *
 * Work-item `gid` adds up entry `gid` of every partial histogram and stores
 * the total in Histogram[gid], i.e. at the head of the same buffer. Each
 * work-item only touches indices congruent to its own gid.
 */
__kernel void reduceKernel(__global uint *Histogram, uint nSubHists)
{
    const uint gid = get_global_id(0);
    uint bin = 0;
    uint i;

    for (i = 0; i < nSubHists; i++)
        bin += Histogram[(i * SIZE_OF_BIN) + gid];
    Histogram[gid] = bin;
}
 // main.c
#include <cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <time.h> #define SIZE_OF_BIN 256 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl"; int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度
{
FILE *fp;
int size;
//printf("<readText> File: %s\n", kernelPath);
fopen_s(&fp, kernelPath, "rb");
if (!fp)
{
printf("<readText> Open file failed\n");
getchar();
exit(-);
}
if (fseek(fp, , SEEK_END) != )
{
printf("<readText> Seek end of file failed\n");
getchar();
exit(-);
}
if ((size = ftell(fp)) < )
{
printf("<readText> Get file position failed\n");
getchar();
exit(-);
}
rewind(fp);
if ((*pcode = (char *)malloc(size + )) == NULL)
{
printf("<readText> Allocate space failed\n");
getchar();
exit(-);
}
fread(*pcode, , size, fp);
(*pcode)[size] = '\0';
fclose(fp);
return size + ;
} int main(int argc, char * argv[])
{
_putenv("GPU_DUMP_DEVICE_KERNEL=3");// 在程序目录输出出 il 和 isa 形式的 kernel 文件,可以使用 isa 汇编调试 cl_int status;
cl_uint nPlatform;
clGetPlatformIDs(, NULL, &nPlatform);
cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
clGetPlatformIDs(nPlatform, listPlatform, NULL);
cl_uint nDevice = ;
clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, , NULL, &nDevice);
cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
cl_command_queue queue = clCreateCommandQueue(context, listDevice[], , &status); char *code;
size_t length = readText(sourceText, &code);
cl_program program = clCreateProgramWithSource(context, , (const char **)&code, &length, NULL);
status = clBuildProgram(program, , listDevice, NULL, NULL, NULL);
cl_kernel histogram = clCreateKernel(program, "histogramKernel", &status);
cl_kernel reduce = clCreateKernel(program, "reduceKernel", &status); cl_uint nThreads = ; // 64 * 1024
cl_uint nThreadsPerGroup = ; // 原始值:KernelCompileWorkGroupSize[0]
cl_uint nGroups = nThreads / nThreadsPerGroup; cl_uint inputByte = ; // 原始值:DeviceMaxMemAllocSize == 2147483648 == 2^31
cl_uint outputNBytes = nGroups * SIZE_OF_BIN * sizeof(cl_uint);
cl_uint element = inputByte / sizeof(cl_uint);
cl_uint element4 = inputByte / sizeof(cl_uint4);
cl_uint element4PerThread = element4 / nThreads; unsigned int *input = (unsigned int*)malloc(inputByte);
unsigned int *cpuhist = (unsigned int*)malloc(outputNBytes);
unsigned int *gpuhist = (unsigned int*)malloc(outputNBytes);
memset(input, , inputByte);
memset(cpuhist, , outputNBytes);
memset(gpuhist, , outputNBytes); int i;
time_t ltime;
cl_uint a, b;
time(&ltime);
for (i = , a = b = (cl_uint)ltime; i < element; i++)// b 的低 16 位乘 a 再加上 b 的高 16 位,赋给新的 b
input[i] = (b = (a * (b & )) + (b >> )); cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inputByte, input, &status);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, outputNBytes, gpuhist, &status); clSetKernelArg(histogram, , sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(histogram, , sizeof(cl_mem), (void *)&outputBuffer);
clSetKernelArg(histogram, , sizeof(cl_uint), (void *)&element4PerThread);
clSetKernelArg(reduce, , sizeof(cl_mem), (void *)&outputBuffer);
clSetKernelArg(reduce, , sizeof(cl_uint), (void *)&nGroups); size_t globalSizeHist = nThreads, localSizeHist = nThreadsPerGroup;
size_t globalSizeReduce = SIZE_OF_BIN, localSizeReduce = ; clEnqueueNDRangeKernel(queue, histogram, , NULL, &globalSizeHist, &localSizeHist, , NULL, NULL);
clEnqueueNDRangeKernel(queue, reduce, , NULL, &globalSizeReduce, &localSizeReduce, , NULL, NULL);
clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, , outputNBytes / nGroups, gpuhist, , NULL, NULL); for (i = , input; i < element; i++) // 使用 CPU 计算,注意每 8 位看做一个元素,取值范围 0 ~ 255
{
cpuhist[(input[i] >> ) & 0xff]++;
cpuhist[(input[i] >> ) & 0xff]++;
cpuhist[(input[i] >> ) & 0xff]++;
cpuhist[(input[i] >> ) & 0xff]++;
} int countError;
for (i = countError = ; i < SIZE_OF_BIN; i++)// 检查结果
{
if (gpuhist[i] != cpuhist[i])
countError++;
}
printf("\n<main> CPU, GPU %s.\n\n\n", countError ? "mismatched" : "matched"); free(listPlatform);
free(listDevice);
free(input);
free(cpuhist);
free(gpuhist);
clReleaseContext(context);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseKernel(histogram);
clReleaseKernel(reduce);
clReleaseMemObject(inputBuffer);
clReleaseMemObject(outputBuffer);
getchar();
return ;
}

● 输出结果

<main> CPU, GPU matched.

OpenCL 直方图的更多相关文章

  1. 《OpenCL异构并行编程实战》补充笔记散点,第五至十二章

    ▶ 第五章,OpenCL 的并发与执行模型 ● 内存对象与上下文相关而不是与设备相关.设备在不同设备之间的移动如下,如果 kernel 在第二个设备上运行,那么在第一个设备上产生的任何数据结果在第二个 ...

  2. OpenCL与CUDA,CPU与GPU

    OpenCL OpenCL(全称Open Computing Language,开放运算语言)是第一个面向异构系统通用目的并行编程的开放式.免费标准,也是一个统一的编程环境,便于软件开发人员为高性能计 ...

  3. OpenCL C

    OpenCL C OpenCL  简介 opencl C是ISO C99的一个扩展,主要区别如下: 去除了C99的一些特性,如:标准C99头文件,函数指针,递归,变长数组,和位域 增加了一些特性用于并 ...

  4. Oracle索引梳理系列(十)- 直方图使用技巧及analyze table操作对直方图统计的影响(谨慎使用)

    版权声明:本文发布于http://www.cnblogs.com/yumiko/,版权由Yumiko_sunny所有,欢迎转载.转载时,请在文章明显位置注明原文链接.若在未经作者同意的情况下,将本文内 ...

  5. 任意半径局部直方图类算法在PC中快速实现的框架。

    在图像处理中,局部算法一般来说,在很大程度上会获得比全局算法更为好的效果,因为他考虑到了图像领域像素的信息,而很多局部算法可以借助于直方图获得加速.同时,一些常规的算法,比如中值滤波.最大值滤波.最小 ...

  6. [LeetCode] Largest Rectangle in Histogram 直方图中最大的矩形

    Given n non-negative integers representing the histogram's bar height where the width of each bar is ...

  7. 基于SoCkit的opencl实验1-基础例程

    基于SoCkit的opencl实验1-基础例程 准备软硬件 Arrow SoCkit Board 4GB or larger microSD Card Quartus II v14.1 SoCEDS ...

  8. opencv 比较直方图方式 进行人脸检测对比

    完整opencv(emgucv)人脸.检测.采集.识别.匹配.对比 //成对几何直方图匹配               public static string MatchHist()         ...

  9. OPenCL

    OpenCLhttp://baike.baidu.com/link?url=7uHWCVUYB3Sau_xh3OOKP-A08_IvmT1SJixdAXKezCuCfkzeSQDiSmesGyVGk8 ...

随机推荐

  1. Vue.js系列之项目搭建(1)

    项目搭建具体步骤如下: 1.安装node 到官网下载安装,我这里是win7系统. (中)https://nodejs.org/zh-cn/ (英)https://nodejs.org/en/ 2.安装 ...

  2. 《高级Web应用程序设计》课程学习(20170911)

    一.课程内容 本学期课件,点击查看 二.作业相关 上交作业的方法 访问ftp://192.168.42.254:22,登录后找到自己的姓名文件夹,放入作业即可.登录账号为stu1,密码为空 作业列表, ...

  3. run jdeveloper, unable to create an instance of the Java Virtual Machine Located at path:

    刚才打开 jdevW.exe 时提示如下错误: Unable to create an instance of the Java Virtual MachineLocated at path:x:\x ...

  4. JS类型转换之valueOf和toString详解

    最近群里有人发了下面这题: 实现一个函数,运算结果可以满足如下预期结果: add(1)(2)// 3 add(1,2,3)(10)// 16 add(1)(2)(3)(4)(5)// 15 对于一个好 ...

  5. XML——概述

    body, table{font-family: 微软雅黑; font-size: 10pt} table{border-collapse: collapse; border: solid gray; ...

  6. 硬盘安装CentOS 6.0(超级详细图文教程)

    硬盘安装CentOS 6.0(超级详细图文教程) 来源:   引言: 电脑系统是Windows XP,电脑没有光驱.手头没有U盘.没有移动硬盘.电脑主板不支持U盘启动,在这种情况下想安装CentOS ...

  7. How to get the full error stack trace of SharePoint

    博客地址 http://blog.csdn.net/foxdave SharePoint开发,怎么得到真实的详细错误信息. 大家在开发遇到页面报错需要提问的时候,先将详细错误信息获取到再提问,谢谢. ...

  8. Protel画完原理图检查的时候出现了这些错误 #1 Error Multiple Net Identifiers

    Error Report For : Documents\Sheet1.Sch 24-Aug-2009 14:58:43 #1 Error Multiple Net Identifiers : She ...

  9. Django 之 富文本编辑器-tinymce

    这里的富文本编辑器以 tinymce 为例. 环境:ubuntu 16.04 + django 1.10 + python 2.7 ubuntu安装tinymce: python 2.7 $ sudo ...

  10. Centos 中扩展 软件源 的安装 之 epel ( 为yum 扩展软件源 EPEL源 )

    EPEL (Extra Packages for Enterprise Linux)是基于Fedora的一个项目,为“红帽系”的操作系统提供额外的软件包,适用于RHEL.CentOS和Scientif ...