OpenCL 前缀和
▶ 向量前缀和
● 最简单的版本,代码
#include <stdio.h>
#include <stdlib.h>
#include <cl.h>

/*
 * OpenCL C source of the prefixSum kernel.
 *
 * Computes, in place, the exclusive prefix sum of input[0 .. n-1] with an
 * up-sweep (partial sums written at stride `offset`) followed by a down-sweep
 * (the last element is zeroed and partial sums are pushed back down).
 *
 * Requirements visible from the code:
 *   - n must be a power of two (level halves / doubles between 1 and n / 2);
 *   - every work-item strides over the pairs of a level by get_local_size(0),
 *     and the levels are separated only by work-group barriers, so the kernel
 *     must be launched as ONE work-group;
 *   - the data lives in __global memory, hence CLK_GLOBAL_MEM_FENCE.
 */
const char *programSource =
"__kernel void prefixSum(__global int *input, int n)                         \n"
"{                                                                            \n"
"    int offset, level, idx, lp, rp, temp;                                    \n"
"    for (offset = 1, level = n / 2; level > 0; offset *= 2, level /= 2)      \n"
"    {                                                                        \n"
"        barrier(CLK_GLOBAL_MEM_FENCE);                                       \n"
"        for (idx = get_local_id(0); idx < level; idx += get_local_size(0))   \n"
"        {                                                                    \n"
"            lp = offset * (2 * idx + 1) - 1;                                 \n"
"            rp = offset * (2 * idx + 2) - 1;                                 \n"
"            input[rp] = input[lp] + input[rp];                               \n"
"        }                                                                    \n"
"    }                                                                        \n"
"    barrier(CLK_GLOBAL_MEM_FENCE);                                           \n"
"    if (get_local_id(0) == 0)                                                \n"
"        input[n - 1] = 0;                                                    \n"
"                                                                             \n"
"    for (level = 1; level < n; level *= 2)                                   \n"
"    {                                                                        \n"
"        offset /= 2;                                                         \n"
"        barrier(CLK_GLOBAL_MEM_FENCE);                                       \n"
"        for (idx = get_local_id(0); idx < level; idx += get_local_size(0))   \n"
"        {                                                                    \n"
"            lp = offset * (2 * idx + 1) - 1;                                 \n"
"            rp = offset * (2 * idx + 2) - 1;                                 \n"
"            temp = input[lp];                                                \n"
"            input[lp] = input[rp];                                           \n"
"            input[rp] += temp;                                               \n"
"        }                                                                    \n"
"    }                                                                        \n"
"}                                                                            \n";

/* Abort with a diagnostic when an OpenCL call did not return CL_SUCCESS. */
static void check(cl_int status, const char *what)
{
    if (status != CL_SUCCESS)
    {
        fprintf(stderr, "%s failed with OpenCL error %d.\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills an array with ones, computes the exclusive prefix sum on
 * the CPU as a reference, runs the prefixSum kernel on the first device of the
 * first platform, and reports whether both results agree (the printed number
 * is the index of the first mismatch, or the element count when all match).
 *
 * NOTE(review): the problem sizes below are reconstructed -- the numeric
 * literals were missing from the published listing. nElement must stay a
 * power of two.
 */
int main()
{
    const size_t nElement = 2048, nWorkItem = 256, datasize = sizeof(int) * nElement;
    const int n = (int)nElement;
    int i, *A, *C;
    cl_int status;

    A = (int*)malloc(datasize);
    C = (int*)malloc(datasize);
    if (A == NULL || C == NULL)
    {
        fprintf(stderr, "Host allocation failed.\n");
        free(A);
        free(C);
        return EXIT_FAILURE;
    }
    for (i = 0; i < n; i++)
        A[i] = 1;
    /* Exclusive prefix sum reference: C[i] = A[0] + ... + A[i-1].            */
    /* (The inclusive variant would be C[0] = A[0]; C[i] = C[i-1] + A[i].)    */
    for (C[0] = 0, i = 1; i < n; i++)
        C[i] = C[i - 1] + A[i - 1];

    /* Platform / device discovery: first platform, all of its devices. */
    cl_uint nPlatform = 0;
    check(clGetPlatformIDs(0, NULL, &nPlatform), "clGetPlatformIDs");
    if (nPlatform == 0)
    {
        fprintf(stderr, "No OpenCL platform found.\n");
        return EXIT_FAILURE;
    }
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    if (listPlatform == NULL)
    {
        fprintf(stderr, "Host allocation failed.\n");
        return EXIT_FAILURE;
    }
    check(clGetPlatformIDs(nPlatform, listPlatform, NULL), "clGetPlatformIDs");

    cl_uint nDevice = 0;
    check(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice), "clGetDeviceIDs");
    if (nDevice == 0)
    {
        fprintf(stderr, "No OpenCL device found.\n");
        return EXIT_FAILURE;
    }
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    if (listDevice == NULL)
    {
        fprintf(stderr, "Host allocation failed.\n");
        return EXIT_FAILURE;
    }
    check(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL), "clGetDeviceIDs");

    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    check(status, "clCreateContext");
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
    check(status, "clCreateCommandQueue");

    /* The kernel scans in place, so a single read-write buffer is enough. */
    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, datasize, NULL, &status);
    check(status, "clCreateBuffer");
    check(clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, datasize, A, 0, NULL, NULL), "clEnqueueWriteBuffer");

    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
    check(status, "clCreateProgramWithSource");
    status = clBuildProgram(program, nDevice, listDevice, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        /* Print the compiler log of the device we are going to run on. */
        size_t logSize = 0;
        clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        char *log = (char*)malloc(logSize + 1);
        if (log != NULL)
        {
            clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
            log[logSize] = '\0';
            fprintf(stderr, "%s\n", log);
            free(log);
        }
        check(status, "clBuildProgram");
    }
    cl_kernel kernel = clCreateKernel(program, "prefixSum", &status);
    check(status, "clCreateKernel");

    check(clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA), "clSetKernelArg(0)");
    check(clSetKernelArg(kernel, 1, sizeof(cl_int), &n), "clSetKernelArg(1)");

    /*
     * Launch exactly one work-group (global size == local size): the kernel
     * synchronises its levels with barrier(), which only spans a work-group.
     * The kernel strides by get_local_size(0), so any group size is valid;
     * clamp the requested one to what the device supports.
     */
    size_t maxWorkGroup = 0, localSize = nWorkItem;
    check(clGetDeviceInfo(listDevice[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroup), &maxWorkGroup, NULL), "clGetDeviceInfo");
    if (localSize > maxWorkGroup)
        localSize = maxWorkGroup;
    check(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &localSize, &localSize, 0, NULL, NULL), "clEnqueueNDRangeKernel");
    check(clFinish(queue), "clFinish");

    check(clEnqueueReadBuffer(queue, bufferA, CL_TRUE, 0, datasize, A, 0, NULL, NULL), "clEnqueueReadBuffer");
    for (i = 0; i < n; i++)
    {
        if (A[i] != C[i])
            break;
    }
    printf("Output is %s, %d.\n", (i == n) ? "correct" : "incorrect", i);

    /* Release in reverse order of creation. */
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(bufferA);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(listDevice);
    free(listPlatform);
    free(C);
    free(A);
    getchar();
    return 0;
}
● 输出结果
Output is correct, 2048.
● 核函数说明
// 代码说明
void prefixSum(int *input, int n)
{
int offset, level, idx, lp, rp, temp;
for (offset = , level = n / ; level > ; offset *= , level /= )// level 为当前循环写入元素个数,每次循环处理所有跨度为 offset 的元素
{
barrier(CLK_LOCAL_MEM_FENCE);
for (idx = get_local_id(); idx < level; idx += get_local_size())// 每次循环处理 get_local_size(0) 对元素,靠前元素加到靠后元素上
{
lp = offset * ( * idx + ) - ;
rp = offset * ( * idx + ) - ;
input[rp] += input[lp];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id() == )// 主线程将最后一个元素置零,之后要向前传,注意到这里时 offset == n
input[n - ] = ;
for (level = ; level < n; level *= )// level 为当前循环写入元素对数,每次循环处理所有跨度为 offset 的元素
{
offset /= ;
barrier(CLK_LOCAL_MEM_FENCE);
for (idx = get_local_id(); idx < level; idx += get_local_size())// 每次循环处理 get_local_size(0) 对元素,靠前元素赋为靠后元素,靠后元素等于两者原值之和
{
lp = offset * ( * idx + ) - ;
rp = offset * ( * idx + ) - ;
temp = input[lp];
input[lp] = input[rp];
input[rp] += temp;
}
}
}
● 使用局部内存优化,C++ 版本
// kernel.cl
/*
 * Sum-reduction of NUM unsigned integers, read four at a time as uint4.
 *
 * input  : NUM / 4 uint4 elements (NUM is expected to be a multiple of 4).
 * output : one uint4 partial sum per work-group; the host adds the four
 *          components of every group's result to obtain the total.
 *
 * The tree reduction below halves the active range each step, so the
 * work-group size must be a power of two, and it must not exceed the
 * capacity of resArray.
 * NOTE(review): the array size (64) is reconstructed -- the literal was
 * missing from the published listing; keep it >= the host's local work size.
 */
__kernel void reduce(__global uint4* input, __global uint4* output, int NUM)
{
    __local uint4 resArray[64];
    unsigned int lid = get_local_id(0), globalSize = get_global_size(0);
    int i;
    uint4 res = (uint4)(0, 0, 0, 0);

    /* First pass: every work-item accumulates its strided share of the
       input in a private variable, then publishes it to local memory. */
    for (i = get_global_id(0); i < NUM / 4; i += globalSize)
        res += input[i];
    resArray[lid] = res;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second pass: pairwise tree reduction inside the work-group. */
    for (i = get_local_size(0) >> 1; i > 0; i >>= 1)
    {
        if (lid < i)
            resArray[lid] += resArray[lid + i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Work-item 0 writes the group's partial sum. */
    if (lid == 0)
        output[get_group_id(0)] = resArray[0];
}
// main.cpp
#include <cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include <new>

const char *filename = "kernel.cl";

using namespace std;

/* Abort with a diagnostic when an OpenCL call did not return CL_SUCCESS. */
static void check(cl_int status, const char *what)
{
    if (status != CL_SUCCESS)
    {
        cerr << what << " failed with OpenCL error " << status << "." << endl;
        exit(EXIT_FAILURE);
    }
}

/*
 * Verify the device result.
 *
 * NUM      : number of input values; the input is 1, 2, ..., NUM.
 * groupNUM : number of work-groups; each contributed 4 partial sums.
 * res      : groupNUM * 4 partial sums read back from the device.
 *
 * Returns non-zero when the partial sums add up to NUM * (NUM + 1) / 2.
 * The kernel accumulates in 32-bit unsigned arithmetic, so both sides are
 * compared modulo 2^32; this also avoids signed overflow for large NUM.
 */
int isVerify(int NUM, int groupNUM, int *res)
{
    cl_uint sum = 0;
    for (int i = 0; i < groupNUM * 4; i++)
        sum += (cl_uint)res[i];
    const unsigned long long expected = (unsigned long long)NUM * ((unsigned long long)NUM + 1) / 2;
    return sum == (cl_uint)expected;
}

/*
 * Read a whole file into a NUL-terminated buffer.
 *
 * Returns a buffer allocated with new[] that the caller must release with
 * delete[], or nullptr when the file cannot be opened, sized, read, or the
 * buffer cannot be allocated.
 */
char* readSource(const char *filename)
{
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if (!f.is_open())
    {
        cout << "<readSource> Error open file " << filename << "\n" << endl;
        return nullptr;
    }
    f.seekg(0, std::fstream::end);
    const std::streamoff length = f.tellg();
    if (length < 0)
    {
        cout << "<readSource> Error read file " << filename << "\n" << endl;
        return nullptr;
    }
    const size_t size = (size_t)length;
    f.seekg(0, std::fstream::beg);
    /* nothrow so that the failure path below is actually reachable */
    char *str = new (std::nothrow) char[size + 1];
    if (str == nullptr)
    {
        cout << "<readSource> Error malloc memory\n" << endl;
        f.close();
        return nullptr;
    }
    f.read(str, size);
    if (!f)
    {
        cout << "<readSource> Error read file " << filename << "\n" << endl;
        delete[] str;
        return nullptr;
    }
    f.close();
    str[size] = '\0';
    return str;
}

/*
 * Host driver: sums 1..NUM on the first device of the first platform with the
 * "reduce" kernel from kernel.cl and checks the result on the CPU.
 *
 * NOTE(review): NUM and the work sizes are reconstructed -- the numeric
 * literals were missing from the published listing. NUM must be a multiple
 * of 4, and the local size must be a power of two no larger than the
 * kernel's local array.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    cl_int status;
    cl_uint nPlatform = 0;
    check(clGetPlatformIDs(0, NULL, &nPlatform), "clGetPlatformIDs");
    if (nPlatform == 0)
    {
        cerr << "No OpenCL platform found." << endl;
        return EXIT_FAILURE;
    }
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    if (listPlatform == NULL)
    {
        cerr << "Host allocation failed." << endl;
        return EXIT_FAILURE;
    }
    check(clGetPlatformIDs(nPlatform, listPlatform, NULL), "clGetPlatformIDs");

    cl_uint nDevice = 0;
    check(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice), "clGetDeviceIDs");
    if (nDevice == 0)
    {
        cerr << "No OpenCL device found." << endl;
        return EXIT_FAILURE;
    }
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    if (listDevice == NULL)
    {
        cerr << "Host allocation failed." << endl;
        return EXIT_FAILURE;
    }
    check(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL), "clGetDeviceIDs");

    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    check(status, "clCreateContext");
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
    check(status, "clCreateCommandQueue");

    /* Build the kernel for the device the queue runs on. */
    const char *source = readSource(filename);
    if (source == nullptr)
        return EXIT_FAILURE;
    const size_t sourceSize = strlen(source);
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &status);
    check(status, "clCreateProgramWithSource");
    check(clBuildProgram(program, 1, listDevice, NULL, NULL, NULL), "clBuildProgram");
    cl_kernel kernel = clCreateKernel(program, "reduce", &status);
    check(status, "clCreateKernel");

    const int NUM = 1024 * 1024;
    size_t global_work_size[1] = { 2048 };
    size_t local_work_size[1] = { 64 };
    size_t groupNUM = global_work_size[0] / local_work_size[0];

    /* Input is 1, 2, ..., NUM; output holds one uint4 (4 ints) per group. */
    int* input = new int[NUM];
    for (int i = 0; i < NUM; i++)
        input[i] = i + 1;
    int* output = new int[groupNUM * 4];

    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(int), (void *)input, &status);
    check(status, "clCreateBuffer(input)");
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, groupNUM * 4 * sizeof(int), NULL, &status);
    check(status, "clCreateBuffer(output)");

    check(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer), "clSetKernelArg(0)");
    check(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer), "clSetKernelArg(1)");
    check(clSetKernelArg(kernel, 2, sizeof(int), &NUM), "clSetKernelArg(2)");

    cl_event eventPoint;
    check(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &eventPoint), "clEnqueueNDRangeKernel");
    check(clWaitForEvents(1, &eventPoint), "clWaitForEvents");
    clReleaseEvent(eventPoint);

    check(clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, groupNUM * 4 * sizeof(int), output, 0, NULL, NULL), "clEnqueueReadBuffer");
    cout << "The result is " << (isVerify(NUM, (int)groupNUM, output) ? "Correct" : "Error") << ".\n" << endl;

    /* new[] memory is released with delete[], malloc memory with free. */
    delete[] input;
    delete[] output;
    delete[] source;
    free(listDevice);
    free(listPlatform);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(inputBuffer);
    clReleaseMemObject(outputBuffer);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    getchar();
    return 0;
}
● 输出结果
The result is Correct.
OpenCL 前缀和的更多相关文章
- 基于SoCkit的opencl实验1-基础例程
基于SoCkit的opencl实验1-基础例程 准备软硬件 Arrow SoCkit Board 4GB or larger microSD Card Quartus II v14.1 SoCEDS ...
- OpenCL与CUDA,CPU与GPU
OpenCL OpenCL(全称Open Computing Language,开放运算语言)是第一个面向异构系统通用目的并行编程的开放式.免费标准,也是一个统一的编程环境,便于软件开发人员为高性能计 ...
- Android AARCH64 平台的 OpenCL 配置
原文地址:Android AARCH64 平台的 OpenCL 配置 Android AARCH64 平台的 OpenCL 配置 开发环境 IDE: Android Studio 3.4.1 Andr ...
- HDU1671——前缀树的一点感触
题目http://acm.hdu.edu.cn/showproblem.php?pid=1671 题目本身不难,一棵前缀树OK,但是前两次提交都没有成功. 第一次Memory Limit Exceed ...
- 【手记】注意BinaryWriter写string的小坑——会在string前加上长度前缀length-prefixed
之前以为BinaryWriter写string会严格按构造时指定的编码(不指定则是无BOM的UTF8)写入string的二进制,如下面的代码: //将字符串"a"写入流,再拿到流的 ...
- ASP.NET Core MVC 配置全局路由前缀
前言 大家好,今天给大家介绍一个 ASP.NET Core MVC 的一个新特性,给全局路由添加统一前缀.严格说其实不算是新特性,不过是Core MVC特有的. 应用背景 不知道大家在做 Web Ap ...
- 如何处理CSS3属性前缀
今天闲来无聊,重新来说说CSS3前缀的问题.在春节前和@一丝姐姐说起Sass中有关于gradient的mixins.姐姐说: 为什么还要用mixin呢?为什么不使用Autoprefixer?使用Aut ...
- context:component-scan" 的前缀 "context" 未绑定。
SpElUtilTest.testSpELLiteralExpressiontestSpELLiteralExpression(cn.zr.spring.spel.SpElUtilTest)org.s ...
- 解决adobe air sdk打包 apk后自动在包名前面加上air. (有个点)前缀的问题
早就找到了这个方法,但是一直忙没心思写博客. 默认情况下,所有 AIR Android 应用程序的包名称都带 air 前缀.若不想使用此默认行为,可将计算机环境变量 AIR_NOANDROIDFLAI ...
随机推荐
- JavaScript--语法2--语句结构
JavaScript--语句结构 一.心得 判断的时候常量放左边java中switch只支持四种类型,javaScript是弱类型,所有的都支持. 显示方法: 77 // alert("x= ...
- webSocket协议与Socket的区别
WebSocket介绍与原理WebSocket protocol 是HTML5一种新的协议.它实现了浏览器与服务器全双工通信(full-duplex).一开始的握手需要借助HTTP请求完成. ——百度 ...
- 豆知识扩展:HTML<meta> tag
豆知识: HTML<meta> tag Metadata 是关于数据的信息. The <meta> tag provides metadata关于网页.Metadat不会显示在 ...
- poj3436网络流之最大流拆点
这题看了半天看不懂题意...还是看的网上题意写的 加一个源点一个汇点,把每个点拆成两个,这两个点的流量是v,其他联通的边都设为无穷大 输入没有1的点就与源点连接,输出只有1的点就与汇点连接 还有这个输 ...
- PowerDesigner16工具学习笔记-建立BPM
根据不同用途,BPM分为分析性(Analysis).执行型(Executable)和协作型(Collaborative) BPM的类型 业务流程语言 描述 分析型 Analysis 提供流程层次 ...
- 12.2 Web窗体--代码片段详解
第12章 使用Web窗体 ※ 除常规HTML元素之外,Web窗体文件还包含另外3种内容:代码片段.可编程HTML元素和控件 ※ 代码隐藏类只应包含特定于单个Web窗体的代码.如果存在多个Web窗体 ...
- 115. Distinct Subsequences *HARD* -- 字符串不连续匹配
Given a string S and a string T, count the number of distinct subsequences of T in S. A subsequence ...
- vue 父组件传递数据给子组件
父组件 <body> <div id="app"> <child v-bind:data = "test"></chi ...
- c/c++指针常见错误
一 #include <bits/stdc++.h> using namespace std; void f(char *str) { char *s = str; str[] = ' / ...
- Alpha阶段敏捷冲刺---Day1
一.Daily Scrum Meeting照片 二.今天冲刺情况反馈 1.昨天已完成的工作 昨天我们组全体成员在五社区五号楼719召开了紧急会议,在会议上我们梳理了编写这个程序的所有流程,并且根 ...