OpenCL多次循环执行内核的一个简单样例

最近有不少朋友在多次循环执行OpenCL内核程序的时候碰到一些问题。由于这些问题对OpenCL初学者而言可能比较普遍，因此我这里给出一个清晰简单的demo来演示如何简单又高效地循环执行OpenCL内核程序。

以下程序的大概意思与流程是：

内核程序含有两个参数，第一个参数既是输入又是输出，第二个参数仅仅用于输入。不过第一个参数只对其初始化一次，而第二个参数在每次循环执行新一次的内核程序前会再传递一次数据。这么做有助于同学更好地去理解、把握存储器对象的基本使用方法。

存储器对象在通过cl_context上下文创建完之后，其所在的GPU端的位置就不变了。因此，我们在循环执行内核程序之前不需要把存储器对象释放掉，然后重新分配。这么做就比较低效了。我们完全可以重用同一个存储器对象。

以下代码在我的MacBook Air上能完全通过编译执行。没有任何warning。

执行环境：基于Haswell微架构的Intel Core i7 4650U，Intel HD Graphics 5000，8GB DDR3L，128GB SSD。

OS X 10.9.2 Mavericks，Xcode 5.1，Apple LLVM 5.1，支持GNU11标准的C编译器。

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

#ifdef __APPLE__

#include <OpenCL/opencl.h>

#else

#include <CL/cl.h>

#endif

int main(void)

{

    cl_int ret;

    cl_platform_id platform_id = NULL;

    cl_device_id device_id = NULL;

    cl_context context = NULL;

    cl_command_queue command_queue = NULL;

    cl_mem memObj1 = NULL;

    cl_mem memObj2 = NULL;

    char *kernelSource = NULL;

    cl_program program = NULL;

    cl_kernel kernel = NULL;

    int *pInputBuffer1 = NULL;

    int *pInputBuffer2 = NULL;

    int *pOutputBuffer = NULL;

    clGetPlatformIDs(, &platform_id, NULL);

    if(platform_id == NULL)

    {

        puts("Get OpenCL platform failed!");

        goto FINISH;

    }

    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, , &device_id, NULL);

    if(device_id == NULL)

    {

        puts("No GPU available as a compute device!");

        goto FINISH;

    }

    context = clCreateContext(NULL, , &device_id, NULL, NULL, &ret);

    if(context == NULL)

    {

        puts("Context not established!");

        goto FINISH;

    }

    command_queue = clCreateCommandQueue(context, device_id, , &ret);

    if(command_queue == NULL)

    {

        puts("Command queue cannot be created!");

        goto FINISH;

    }

    // Specify the path of the kernel source

    const char *pFileName = "/Users/zennychen/Downloads/test.cl";

    FILE *fp = fopen(pFileName, "r");

    if (fp == NULL)

    {

        puts("The specified kernel source file cannot be opened!");

    goto FINISH;

    }

    fseek(fp, , SEEK_END);

    const long kernelLength = ftell(fp);

    fseek(fp, , SEEK_SET);

    kernelSource = malloc(kernelLength);

    fread(kernelSource, , kernelLength, fp);

    fclose(fp);

    program = clCreateProgramWithSource(context, , (const char**)&kernelSource, (const size_t*)&kernelLength, &ret);

    ret = clBuildProgram(program, , &device_id, NULL, NULL, NULL);

    if (ret != CL_SUCCESS)

    {

        size_t len;

        char buffer[ * ];

        printf("Error: Failed to build program executable!\n");

        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

        printf("%s\n", buffer);

        goto FINISH;

    }

    kernel = clCreateKernel(program, "test", &ret);

    if(kernel == NULL)

    {

        puts("Kernel failed to create!");

        goto FINISH;

    }

    const size_t contentLength = sizeof(*pInputBuffer1) *  * ;

    // 这里预分配的缓存大小为4MB，第一个参数是读写的

    memObj1 = clCreateBuffer(context, CL_MEM_READ_WRITE, contentLength, NULL, &ret);

    if(memObj1 == NULL)

    {

        puts("Memory object1 failed to create!");

        goto FINISH;

    }

    // 这里预分配的缓存大小为4MB，第一个参数是只读的

    memObj2 = clCreateBuffer(context, CL_MEM_READ_ONLY, contentLength, NULL, &ret);

    if(memObj1 == NULL)

    {

        puts("Memory object2 failed to create!");

        goto FINISH;

    }

    ret = clSetKernelArg(kernel, , sizeof(cl_mem), (void *)&memObj1);

    ret |= clSetKernelArg(kernel, , sizeof(cl_mem), (void *)&memObj2);

    if(ret != CL_SUCCESS)

    {

        puts("Set arguments error!");

        goto FINISH;

    }

    // 以下为在主机端分配输入缓存

    pInputBuffer1 = malloc(contentLength);

    pInputBuffer2 = malloc(contentLength);

    // 然后对此工作缓存进行初始化

    for(int i = ; i <  * ; i++)

        pInputBuffer1[i] = i + ;

    memset(pInputBuffer2, , contentLength);

    // 然后分配输出缓存

    pOutputBuffer = malloc(contentLength);

    // 先将第一个参数的数据传入GPU端，以后就不去改动了

    ret = clEnqueueWriteBuffer(command_queue, memObj1, CL_TRUE, , contentLength, pInputBuffer1, , NULL, NULL);

    if(ret != CL_SUCCESS)

    {

        puts("Data transfer failed");

        goto FINISH;

    }

    int count = ;  // 执行5次循环

    do

    {

        // 先将第二个参数传给GPU

        ret = clEnqueueWriteBuffer(command_queue, memObj2, CL_TRUE, , contentLength, pInputBuffer2, , NULL, NULL);

        if(ret != CL_SUCCESS)

        {

            puts("Data transfer failed");

            goto FINISH;

        }

        // 这里指定将总共有1024 * 1024个work-item

        ret = clEnqueueNDRangeKernel(command_queue, kernel, , NULL, (const size_t[]){ * }, NULL, , NULL, NULL);

        // 将结果拷贝给主机端

        ret = clEnqueueReadBuffer(command_queue, memObj1, CL_TRUE, , contentLength, pOutputBuffer, , NULL, NULL);

        // 做次同步，这里偷懒，不用wait event机制了～

        clFinish(command_queue);

        // 做校验

        const int newValue =  - count + ;

        const int addition = ( - count) * newValue / ;

        for(int i = ; i <  * ; i++)

        {

            if(pOutputBuffer[i] != i +  + addition)

            {

                puts("Result error!");

                break;

            }

        }

        // 最后，给第二个缓存初始化新数据

        for(int i = ; i <  * ; i++)

            pInputBuffer2[i] = newValue;

    }

    while(--count > );

FINISH:

    /* Finalization */

    if(pInputBuffer1 != NULL)

        free(pInputBuffer1);

    if(pInputBuffer2 != NULL)

        free(pInputBuffer2);

    if(pOutputBuffer != NULL)

        free(pOutputBuffer);

    if(kernelSource != NULL)

        free(kernelSource);

    if(memObj1 != NULL)

        clReleaseMemObject(memObj1);

    if(memObj2 != NULL)

        clReleaseMemObject(memObj2);

    if(kernel != NULL)

        clReleaseKernel(kernel);

    if(program != NULL)

        clReleaseProgram(program);

    if(command_queue != NULL)

        clReleaseCommandQueue(command_queue);

    if(context != NULL)

        clReleaseContext(context);

    return ;

}

上面OpenCL内核源文件的路径被写死了——“/Users/zennychen/Downloads/test.cl”。各位可以根据自己环境重新指定。

另外，上面用了一些C99语法特性。如果是用Win7的小伙伴们，请使用Visual Studio 2013（Express/Professional）的C编译器。

下面是OpenCL内核源文件：

/*
 * Element-wise accumulate: each work-item adds one element of pIn into the
 * matching element of pInOut.
 *
 * pInOut  buffer that is both read and written by the kernel
 * pIn     buffer that is only read by the kernel
 *
 * The kernel indexes with the global id of dimension 0, so it is meant to be
 * launched over a one-dimensional range with one work-item per element.
 */
__kernel void test(__global int *pInOut, __global int *pIn)
{
    int index = get_global_id(0);

    pInOut[index] += pIn[index];
}

OpenCL多次循环执行内核的一个简单样例的更多相关文章

Spring Ajax一个简单样例
配置不说了.要在前面helloworld的样例基础上弄. 相同在hello下新建ajax.jsp <%@ page language="java" contentType=& ...
VB.net数据库编程（03）：一个SQLserver连接查询的简单样例
这个样例,因为在ADO.net入门已经专门学了,再次进行复习一下. 主要掌握连接字串的情况. 过程就是: 1.引用System.Data.SqlClient.而Access中引用的是System. ...
PHP初学者如何搭建环境，并在本地服务器（or云端服务器）运行自己的第一个PHP样例
页面底部有PHP代码样例供测试使用. 1.PHP开发,你需要什么? 1)开发代码的工具,可以用IDE名字叫做phpDesigner.当然也可以临时用记事本代替,记得文件扩展名为.php 2)服务器(本 ...
hdu1011（树形背包)(提供一个特殊样例)
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1011 Starship Troopers Time Limit: 10000/5000 MS (Jav ...
golang 记录函数执行耗时的一个简单方法。
先写一个公共函数, 比如在 common 包下有这么一个方法: // 写超时警告日志通用方法 func TimeoutWarning(tag, detailed string, start time ...
【Xcode学C-1】怎样用Xcode练习C语言，并练习一个输出样例，以及重要的注意事项
直接用Xcode学习C语言,为iOS开发打基础. (1)选择OS X >>> Application >>> Command Line Tool (2)输入产品名称 ...
C# 调用系统API 内核简单样例
using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.R ...
eclipse 配置执行hadoop 2.7 程序样例參考步骤
前提:你搭建好了hadoop 2.x的linux环境,并可以成功执行.还有就是window可以訪问到集群.over 1. hfds-site.xml 添加属性:关闭集群的权限校验.windows的用户 ...
C#开发Unity游戏教程循环遍历做出推断及Unity游戏演示样例
C#开发Unity游戏教程循环遍历做出推断及Unity游戏演示样例 Unity中循环遍历每一个数据,并做出推断非常多时候.游戏在玩家做出推断以后.游戏程序会遍历玩家身上大量的所需数据,然后做出推断. ...

随机推荐

C++自问
1.forwarding reference 2.move 3. map的内部实现 rb tree,但rbtree优点是什么?使用情况?和b+有啥区别? 4.顺序容器和关联容器的区别: 本质区别是顺序 ...
js 执行完setTimeout再接着执行函数
var counter = 0; function increase(){ var d = jQuery.Deferred(); var doIncrease = function() { if(co ...
MySQL菜鸟入门“秘籍”
一.MySQL简介 1.什么是数据库 ? 数据库(Database)是按照数据结构来组织.存储和管理数据的仓库,它产生于距今六十多年前,随着信息技术和市场的发展,特别是二十世纪九十年代以后,数据管理不 ...
win10锁屏壁纸文件夹Assets中无文件问题的解决方法
一.前言 win10在锁屏时会有很多精美的壁纸,在网上查找到win10锁屏壁纸存放目录为 : C:\Users\你的用户名\AppData\Local\Packages\Microsoft.Windo ...
mongodb索引复合索引
当我们的查询条件不只有一个时,就需要建立复合索引,比如插入一条{x:1,y:2,z:3}记录,按照我们之前建立的x为1的索引,可是使用x查询,现在想按照x与y的值查询,就需要创建如下的索引创 ...
2018年长沙理工大学第十三届程序设计竞赛 I 连续区间的最大公约数
连续区间的最大公约数思路:参照BZOJ 4488: [Jsoi2015]最大公约数脑补出的一个\(map\)套\(vector\)的写法,写起来比线段树短,运行时间比线段树快. 代码: #pragm ...
做一个函数返回当前日期、当前时间格式为“XXXX年XX月XX日”
import time import datetime def time_strf(now_date):#传入0,1,2返回当前日期.当前时间.当前日期与时间 today=datetime.date ...
PAT1046
题目链接 https://pintia.cn/problem-sets/994805260223102976/problems/994805277847568384 题解题目有几个点需要注意: 甲和 ...
c语言1博客作业05
一.本周作业头这个作业属于那个课程 C语言程序设计II 这个作业要求在哪里 https://edu.cnblogs.com/campus/zswxy/SE2019-3/homework/9831 我 ...
17、生命周期-BeanPostProcessor在Spring底层的使用
17.生命周期-BeanPostProcessor在Spring底层的使用 bean赋值.注入其他组件.@Autowired注解.生命周期注解.@Async --都是 BeanPostProcesso ...

OpenCL多次循环执行内核的一个简单样例

OpenCL多次循环执行内核的一个简单样例的更多相关文章

随机推荐

热门专题