▶ 向量前缀和

● 最简单的版本,代码

 // Exclusive ("open") prefix sum of an int vector, computed in place by one OpenCL work-group.
// NOTE(review): every numeric literal of this listing was lost in extraction. The kernel's loop
// bounds and indices are rebuilt from the up-sweep/down-sweep scan that the surviving code
// follows; the problem sizes and fill value in main() are placeholders -- confirm them.
#include <stdio.h>
#include <stdlib.h>
#include <cl.h>

// Kernel source. Requirements: n is a power of two and the kernel runs as a single work-group,
// because work is distributed by local id only and barrier() synchronizes one group only.
const char *programSource =
"__kernel void prefixSum(__global int *input, int n)                            \n"
"{                                                                              \n"
"    int offset, level, idx, lp, rp, temp;                                      \n"
"    for (offset = 1, level = n / 2; level > 0; offset *= 2, level /= 2)        \n"
"    {                                                                          \n"
"        for (idx = get_local_id(0); idx < level; idx += get_local_size(0))     \n"
"        {                                                                      \n"
"            lp = offset * (2 * idx + 1) - 1;                                   \n"
"            rp = offset * (2 * idx + 2) - 1;                                   \n"
"            input[rp] = input[lp] + input[rp];                                 \n"
"        }                                                                      \n"
"        barrier(CLK_GLOBAL_MEM_FENCE);                                         \n"
"    }                                                                          \n"
"    if (get_local_id(0) == 0)                                                  \n"
"        input[n - 1] = 0;                                                      \n"
"    barrier(CLK_GLOBAL_MEM_FENCE);                                             \n"
"    for (level = 1; level < n; level *= 2)                                     \n"
"    {                                                                          \n"
"        offset /= 2;                                                           \n"
"        for (idx = get_local_id(0); idx < level; idx += get_local_size(0))     \n"
"        {                                                                      \n"
"            lp = offset * (2 * idx + 1) - 1;                                   \n"
"            rp = offset * (2 * idx + 2) - 1;                                   \n"
"            temp = input[lp];                                                  \n"
"            input[lp] = input[rp];                                             \n"
"            input[rp] += temp;                                                 \n"
"        }                                                                      \n"
"        barrier(CLK_GLOBAL_MEM_FENCE);                                         \n"
"    }                                                                          \n"
"}                                                                              \n";

// Print a message and terminate when an OpenCL call did not return CL_SUCCESS.
static void check(cl_int status, const char *what)
{
    if (status != CL_SUCCESS)
    {
        fprintf(stderr, "%s failed with status %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

// Runs the scan on the first device of the first platform and compares the result with a
// host-side reference. Returns 0 on completion; exits with EXIT_FAILURE on any OpenCL error.
int main()
{
    // NOTE(review): placeholder sizes. nElement must be a power of two; nWorkItem must not
    // exceed the device's maximum work-group size.
    const size_t nElement = 1024, nWorkItem = 256, datasize = sizeof(int) * nElement;
    const int n = (int)nElement;
    int i, *A, *C;
    cl_int status;

    A = (int*)malloc(datasize);
    C = (int*)malloc(datasize);
    if (A == NULL || C == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(A);
        free(C);
        return EXIT_FAILURE;
    }
    for (i = 0; i < n; i++)
        A[i] = 1;                               // NOTE(review): placeholder fill value
    for (C[0] = 0, i = 1; i < n; i++)           // exclusive (open) prefix sum reference
        C[i] = C[i - 1] + A[i - 1];
    // Inclusive (closed) variant of the reference: C[0] = A[0]; C[i] = C[i - 1] + A[i].

    cl_uint nPlatform = 0;
    check(clGetPlatformIDs(0, NULL, &nPlatform), "clGetPlatformIDs");
    if (nPlatform == 0)
    {
        fprintf(stderr, "no OpenCL platform found\n");
        return EXIT_FAILURE;
    }
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    check(clGetPlatformIDs(nPlatform, listPlatform, NULL), "clGetPlatformIDs");
    cl_uint nDevice = 0;
    check(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice), "clGetDeviceIDs");
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    check(clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL), "clGetDeviceIDs");
    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    check(status, "clCreateContext");
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
    check(status, "clCreateCommandQueue");

    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, datasize, NULL, &status);
    check(status, "clCreateBuffer");
    check(clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, datasize, A, 0, NULL, NULL), "clEnqueueWriteBuffer");

    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
    check(status, "clCreateProgramWithSource");
    status = clBuildProgram(program, nDevice, listDevice, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        // Show the compiler output before giving up; the kernel lives in a string literal.
        size_t logSize = 0;
        clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        char *buildLog = (char*)malloc(logSize + 1);
        if (buildLog != NULL)
        {
            clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, logSize, buildLog, NULL);
            buildLog[logSize] = '\0';
            fprintf(stderr, "%s\n", buildLog);
            free(buildLog);
        }
        check(status, "clBuildProgram");
    }
    cl_kernel kernel = clCreateKernel(program, "prefixSum", &status);
    check(status, "clCreateKernel");

    check(clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA), "clSetKernelArg(0)");
    check(clSetKernelArg(kernel, 1, sizeof(cl_int), &n), "clSetKernelArg(1)");
    // Global size equals local size: exactly one work-group. Launching nElement work-items
    // would start several groups that all scan the same buffer concurrently.
    status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &nWorkItem, &nWorkItem, 0, NULL, NULL);
    check(status, "clEnqueueNDRangeKernel");
    check(clFinish(queue), "clFinish");

    check(clEnqueueReadBuffer(queue, bufferA, CL_TRUE, 0, datasize, A, 0, NULL, NULL), "clEnqueueReadBuffer");
    for (i = 0; i < n; i++)
    {
        if (A[i] != C[i])                       // stop at the first mismatch so i reports it
            break;
    }
    printf("Output is %s, %d.\n", (i == n) ? "correct" : "incorrect", i);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(bufferA);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(listDevice);
    free(listPlatform);
    free(A);
    free(C);
    return 0;
}

● 输出结果

Output is correct, .

● 核函数说明

 // 代码说明
void prefixSum(int *input, int n)
int offset, level, idx, lp, rp, temp;
for (offset = , level = n / ; level > ; offset *= , level /= )// level 为当前循环写入元素个数,每次循环处理所有跨度为 offset 的元素
for (idx = get_local_id(); idx < level; idx += get_local_size())// 每次循环处理 get_local_size(0) 对元素,靠前元素加到靠后元素上
lp = offset * ( * idx + ) - ;
rp = offset * ( * idx + ) - ;
input[rp] += input[lp];
if (get_local_id() == )// 主线程将最后一个元素置零,之后要向前传,注意到这里时 offset == n
input[n - ] = ;
for (level = ; level < n; level *= )// level 为当前循环写入元素对数,每次循环处理所有跨度为 offset 的元素
offset /= ;
for (idx = get_local_id(); idx < level; idx += get_local_size())// 每次循环处理 get_local_size(0) 对元素,靠前元素赋为靠后元素,靠后元素等于两者原值之和
lp = offset * ( * idx + ) - ;
rp = offset * ( * idx + ) - ;
temp = input[lp];
input[lp] = input[rp];
input[rp] += temp;

● 使用局部内存优化的规约(reduce),C++ 版本

 // kernel.cl
// Sums the first NUM / 4 uint4 vectors of input and writes one uint4 partial sum per
// work-group to output[get_group_id(0)]. The caller adds up the partial sums.
// The tree reduction below assumes the work-group size is a power of two.
// NOTE(review): numeric literals were lost in extraction and are rebuilt here.
__kernel void reduce(__global uint4* input, __global uint4* output, int NUM)
{
    // NOTE(review): placeholder length; it must be at least the work-group size used at launch.
    __local uint4 resArray[256];
    unsigned int lid = get_local_id(0), globalSize = get_global_size(0);
    int i;
    uint4 res = (uint4)(0, 0, 0, 0);

    // First stage: each work-item accumulates a strided slice of the input into a private sum.
    for (i = get_global_id(0); i < NUM / 4; i += globalSize)
        res += input[i];
    resArray[lid] = res;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Second stage: tree reduction in local memory, halving the active range each step.
    for (i = get_local_size(0) >> 1; i > 0; i >>= 1)
    {
        if (lid < i)
            resArray[lid] += resArray[lid + i];
        barrier(CLK_LOCAL_MEM_FENCE);   // the next step reads sums written by other work-items
    }

    if (lid == 0)
        output[get_group_id(0)] = resArray[0];
}
 // main.cpp
// Host program for the "reduce" kernel in kernel.cl: fills an array with 1..NUM, reduces it on
// the device to one 4-component partial sum per work-group, and verifies the total on the host.
// NOTE(review): numeric literals were lost in extraction; API indices are rebuilt from the call
// signatures, while NUM and the work sizes are placeholders -- confirm them.
#include <cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include <new>

const char *filename = "kernel.cl";

using namespace std;

// Verifies the result: returns nonzero when the groupNUM * 4 partial sums in res add up to
// 1 + 2 + ... + NUM. The kernel accumulates in 32-bit unsigned lanes, so both sides are
// compared modulo 2^32 instead of overflowing a signed int.
int isVerify(int NUM, int groupNUM, int *res)
{
    unsigned int sum = 0;
    for (int i = 0; i < groupNUM * 4; i++)
        sum += (unsigned int)res[i];
    const unsigned long long expected = ((unsigned long long)NUM + 1) * (unsigned long long)NUM / 2;
    return sum == (unsigned int)expected;
}

// Reads a whole file into a NUL-terminated buffer allocated with new[].
// Returns nullptr on failure; the caller releases the buffer with delete[].
char* readSource(const char *filename)
{
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if (!f.is_open())
    {
        cout << "<readSource> Error open file " << filename << "\n" << endl;
        return nullptr;
    }
    f.seekg(0, std::fstream::end);
    size_t size = (size_t)f.tellg();
    f.seekg(0, std::fstream::beg);
    // nothrow so that the null check below is reachable; plain new would throw instead.
    char *str = new (std::nothrow) char[size + 1];
    if (str == nullptr)
    {
        cout << "<readSource> Error malloc memory\n" << endl;
        return nullptr;
    }
    f.read(str, size);
    str[size] = '\0';
    return str;
}

// Builds kernel.cl for the first device of the first platform, runs "reduce" and prints
// whether the device result matches. Returns 0 on completion, 1 on a setup failure.
int main(int argc, char* argv[])
{
    cl_int status;
    cl_uint nPlatform;
    clGetPlatformIDs(0, NULL, &nPlatform);
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    clGetPlatformIDs(nPlatform, listPlatform, NULL);
    cl_uint nDevice = 0;
    clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);

    const char *source = readSource(filename);
    if (source == nullptr)              // readSource already reported the reason
        return 1;
    const size_t sourceSize = strlen(source);
    cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, NULL);
    status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        cout << "<main> Error build program, status " << status << "\n" << endl;
        return 1;
    }
    cl_kernel kernel = clCreateKernel(program, "reduce", NULL);

    // NOTE(review): placeholder sizes. NUM must be a multiple of 4 (the kernel reads uint4),
    // and the local size must not exceed the length of the kernel's __local array.
    const int NUM = 512 * 1024;
    size_t global_work_size[1] = { 2048 };
    size_t local_work_size[1] = { 256 };
    size_t groupNUM = global_work_size[0] / local_work_size[0];
    int* input = new int[NUM];
    for (int i = 0; i < NUM; i++)
        input[i] = i + 1;
    int* output = new int[groupNUM * 4];    // one uint4 (4 ints) per work-group
    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(int), (void *)input, NULL);
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, groupNUM * 4 * sizeof(int), NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
    clSetKernelArg(kernel, 2, sizeof(int), &NUM);
    cl_event eventPoint;
    status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &eventPoint);
    if (status != CL_SUCCESS)
    {
        cout << "<main> Error enqueue kernel, status " << status << "\n" << endl;
        return 1;
    }
    clWaitForEvents(1, &eventPoint);
    clReleaseEvent(eventPoint);

    clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, groupNUM * 4 * sizeof(int), output, 0, NULL, NULL);
    cout << "The result is " << (isVerify(NUM, (int)groupNUM, output) ? "Correct" : "Error") << ".\n" << endl;

    // Memory from new[] is released with delete[], memory from malloc with free.
    clReleaseMemObject(inputBuffer);
    clReleaseMemObject(outputBuffer);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    delete[] input;
    delete[] output;
    delete[] source;
    free(listDevice);
    free(listPlatform);
    return 0;
}

● 输出结果

The result is Correct.

