《GPU高性能编程CUDA实战》第八章 图形互操作性
▶ OpenGL与DirectX,等待填坑。
● basic_interop
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_bitmap.h"

// OpenGL buffer-object entry points. They start out NULL and are resolved at
// run time in main() via GET_PROC_ADDRESS before first use.
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;

// Width and height of the image, in pixels.
#define DIM 512

// Handle of the OpenGL pixel buffer object, and the CUDA view of that same buffer.
GLuint bufferObj;
cudaGraphicsResource *resource;

// based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
// Fills the mapped pixel buffer with a green pattern, one thread per pixel.
//
// Launch layout: 2-D grid of 2-D blocks. There is no bounds check, so
// gridDim * blockDim must cover the image exactly; main() launches
// (DIM/16, DIM/16) blocks of (16, 16) threads.
//
// NOTE(review): the integer literals in this listing (128, 127, 100, 0, 255,
// 27, 1, 4, 16) were missing from the copy under review and have been restored
// from the published "CUDA by Example" basic_interop sample - please confirm.
__global__ void kernel(uchar4 *ptr)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x / (float)DIM - 0.5f;
    float fy = y / (float)DIM - 0.5f;
    // 128 + 127 * sin(..) stays within [1, 255], so the narrowing cast is safe
    unsigned char green =
        (unsigned char)(128 + 127 * sinf(fabsf(fx * 100) - fabsf(fy * 100)));

    // accessing uchar4 vs unsigned char*
    ptr[offset].x = 0;
    ptr[offset].y = green;
    ptr[offset].z = 0;
    ptr[offset].w = 255;
}

// GLUT keyboard callback: on ESC (key code 27) releases the CUDA/OpenGL
// resources and terminates the process. Other keys are ignored.
static void key_func(unsigned char key, int x, int y)
{
    switch (key)
    {
    case 27:
        // clean up OpenGL and CUDA
        cudaGraphicsUnregisterResource(resource);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
        glDeleteBuffers(1, &bufferObj);
        exit(0);
    }
}

// GLUT display callback: draws the currently bound pixel buffer and swaps.
static void draw_func(void)
{
    // we pass zero as the last parameter, because our bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glutSwapBuffers();
}

// Creates a GLUT window and an OpenGL pixel buffer, lets CUDA render into the
// buffer once, then hands control to the GLUT main loop.
// Returns 0 on normal exit, 1 if set-up or the kernel launch fails.
int main(int argc, char **argv)
{
    cudaDeviceProp prop;
    int dev;

    // ask for any device of compute capability >= 1.0
    memset(&prop, 0, sizeof(cudaDeviceProp));
    prop.major = 1;
    prop.minor = 0;
    cudaChooseDevice(&dev, &prop);

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide: Interoperability with OpenGL
    // requires that the CUDA device be specified by
    // cudaGLSetGLDevice() before any other runtime calls.
    // NOTE(review): in the copy under review this call had been swallowed by
    // the comment above. Newer CUDA releases mark cudaGLSetGLDevice() as
    // deprecated - confirm against the toolkit version in use.
    cudaGLSetGLDevice(dev);

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
    glutInitWindowSize(DIM, DIM);
    glutCreateWindow("bitmap"); // initialise and create a window

    // resolve the buffer-object entry points
    glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
    if (glBindBuffer == NULL || glDeleteBuffers == NULL ||
        glGenBuffers == NULL || glBufferData == NULL)
    {
        // calling through a NULL entry point would crash
        fprintf(stderr, "OpenGL buffer object functions are not available\n");
        return 1;
    }

    // the first three are standard OpenGL, the 4th is the CUDA reg
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers(1, &bufferObj);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
    // 4 bytes per pixel (one uchar4)
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB);

    // register bufferObj as a graphics resource
    cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone);

    // do work with the memory dst being on the GPU, gotten via mapping
    cudaGraphicsMapResources(1, &resource, NULL);
    uchar4 *devPtr;
    size_t size;
    cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource);

    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(devPtr);

    // a kernel launch returns nothing; a bad configuration only shows up here
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
        return 1;
    }
    cudaGraphicsUnmapResources(1, &resource, NULL);

    // set up GLUT and kick off main loop
    glutKeyboardFunc(key_func);
    glutDisplayFunc(draw_func);
    glutMainLoop();

    // only reached if glutMainLoop() returns
    getchar();
    return 0;
}
● ripple
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h"

// Width and height of the animation frame, in pixels.
#define DIM 1024

// Renders one frame of a grey ripple centred on the image, one thread per pixel.
//
// ptr   - output frame, written as uchar4 RGBA
// ticks - animation time step; shifts the phase of the wave
//
// Launch layout: 2-D grid of 2-D blocks. There is no bounds check, so
// gridDim * blockDim must cover the image exactly; generate_frame() launches
// (DIM/16, DIM/16) blocks of (16, 16) threads.
//
// NOTE(review): the integer literals (2, 255, 16, 0) were missing from the
// copy under review and have been restored from the published
// "CUDA by Example" ripple sample - please confirm.
__global__ void kernel(uchar4 *ptr, int ticks)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x - DIM / 2;
    float fy = y - DIM / 2;
    float d = sqrtf(fx * fx + fy * fy); // distance from the image centre
    unsigned char grey = (unsigned char)(128.0f + 127.0f *
                                         cosf(d / 10.0f - ticks / 7.0f) /
                                         (d / 10.0f + 1.0f));
    ptr[offset].x = grey;
    ptr[offset].y = grey;
    ptr[offset].z = grey;
    ptr[offset].w = 255;
}

// Per-frame callback handed to GPUAnimBitmap: launches kernel() over the whole
// frame. The unnamed void* parameter is the user-data slot and is not used.
void generate_frame(uchar4 *pixels, void*, int ticks)
{
    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(pixels, ticks);
}

// Creates the animated bitmap and runs it with generate_frame as the frame
// callback and no clean-up callback.
int main(void)
{
    GPUAnimBitmap bitmap(DIM, DIM, NULL);

    bitmap.anim_and_exit((void(*)(uchar4*, void*, int))generate_frame, NULL);

    // only reached if anim_and_exit() returns
    getchar();
    return 0;
}
● heat
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h"

#define DIM 1024        // width and height of the simulation grid
#define MAX_TEMP 1.0f   // value written into hot cells
#define MIN_TEMP 0.0001f // value written into cold cells
#define SPEED 0.25f     // weight of the neighbour term in blend_kernel

// these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;

// this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
// dst    - output grid, one float per cell
// dstOut - true: read the previous state through texIn;
//          false: read it through texOut
//
// Launch layout: 2-D grid of 2-D blocks, one thread per cell, no bounds check.
// Neighbour rows are addressed with DIM while the cell's own offset uses
// blockDim.x * gridDim.x, so the launch must cover exactly DIM x DIM threads.
//
// NOTE(review): the integer literals (1, 0, DIM - 1, 4) were missing from the
// copy under review and have been restored from the published
// "CUDA by Example" heat sample - please confirm.
__global__ void blend_kernel(float *dst, bool dstOut)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // horizontal neighbours; on the left/right edge the cell is its own neighbour
    int left = offset - 1;
    int right = offset + 1;
    if (x == 0) left++;
    if (x == DIM - 1) right--;

    // vertical neighbours; on the top/bottom edge the cell is its own neighbour
    int top = offset - DIM;
    int bottom = offset + DIM;
    if (y == 0) top += DIM;
    if (y == DIM - 1) bottom -= DIM;

    float t, l, c, r, b;
    if (dstOut)
    {
        t = tex1Dfetch(texIn, top);
        l = tex1Dfetch(texIn, left);
        c = tex1Dfetch(texIn, offset);
        r = tex1Dfetch(texIn, right);
        b = tex1Dfetch(texIn, bottom);
    }
    else
    {
        t = tex1Dfetch(texOut, top);
        l = tex1Dfetch(texOut, left);
        c = tex1Dfetch(texOut, offset);
        r = tex1Dfetch(texOut, right);
        b = tex1Dfetch(texOut, bottom);
    }
    // move the cell towards the average of its four neighbours
    dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

// NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
// Copies every non-zero cell of the constant-source grid (read through
// texConstSrc) into iptr; cells whose constant value is zero are left as-is.
//
// Launch layout: 2-D grid of 2-D blocks, one thread per cell, no bounds check,
// so the launch must cover the grid exactly.
//
// NOTE(review): the literal 0 in the comparison was missing from the copy
// under review and has been restored from the published "CUDA by Example"
// heat sample - please confirm.
__global__ void copy_const_kernel(float *iptr)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float c = tex1Dfetch(texConstSrc, offset);
    if (c != 0)
        iptr[offset] = c;
}

// globals needed by the update routine
struct DataBlock
{
    float *dev_inSrc;       // device grid bound to texIn
    float *dev_outSrc;      // device grid bound to texOut
    float *dev_constSrc;    // device grid bound to texConstSrc
    cudaEvent_t start, stop; // events bracketing each frame for timing
    float totalTime;        // accumulated frame time in ms
    float frames;           // number of frames rendered so far
};

// Per-frame callback: advances the simulation, converts the result to colours
// in outputBitmap and prints the running average frame time.
//
// outputBitmap - frame buffer to fill
// d            - simulation state and timing accumulators
// ticks        - animation time step (unused here)
//
// NOTE(review): the integer literals (0, 16, 90) were missing from the copy
// under review. They have been restored from the published "CUDA by Example"
// heat sample; the step count of 90 in particular should be confirmed.
void anim_gpu(uchar4* outputBitmap, DataBlock *d, int ticks)
{
    // Must be even: float_to_color below reads dev_inSrc, which holds the
    // latest state only after an even number of in/out swaps.
    const int stepsPerFrame = 90;

    cudaEventRecord(d->start, 0);
    dim3 blocks(DIM / 16, DIM / 16);
    dim3 threads(16, 16);

    // since tex is global and bound, we have to use a flag to
    // select which is in/out per iteration
    volatile bool dstOut = true;
    for (int i = 0; i < stepsPerFrame; i++)
    {
        float *in, *out;
        if (dstOut)
        {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }
        else
        {
            out = d->dev_inSrc;
            in = d->dev_outSrc;
        }
        copy_const_kernel<<<blocks, threads>>>(in);    // re-impose the fixed cells
        blend_kernel<<<blocks, threads>>>(out, dstOut); // one diffusion step
        dstOut = !dstOut;
    }
    float_to_color<<<blocks, threads>>>(outputBitmap, d->dev_inSrc);

    cudaEventRecord(d->stop, 0);
    cudaEventSynchronize(d->stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}

// clean up memory allocated on the GPU
// Exit callback: unbinds the three textures, frees the three device grids and
// destroys the timing events held in d.
void anim_exit(DataBlock *d)
{
    cudaUnbindTexture(texIn);
    cudaUnbindTexture(texOut);
    cudaUnbindTexture(texConstSrc);
    cudaFree(d->dev_inSrc);
    cudaFree(d->dev_outSrc);
    cudaFree(d->dev_constSrc);
    cudaEventDestroy(d->start);
    cudaEventDestroy(d->stop);
}

// Allocates the three device grids, binds them to the textures, uploads the
// constant-temperature cells and the initial temperature field, then runs the
// animation with anim_gpu / anim_exit as callbacks.
// Returns 0 on normal exit, 1 if the host staging buffer cannot be allocated.
int main(void)
{
    // NOTE(review): every integer literal in this function was missing from
    // the copy under review. The zero initialisers and loop starts are
    // unambiguous, but the rectangle bounds below could NOT be recovered.
    // They are placeholders loosely modelled on the regions used by the
    // book's heat sample - replace them with the intended values.
    const int hotX0 = 300, hotX1 = 600, hotY0 = 310, hotY1 = 601;    // fixed hot region
    const int coldX0 = 400, coldX1 = 500, coldY0 = 800, coldY1 = 900; // fixed cold region
    const int initX0 = 0, initX1 = 200, initY0 = 800, initY1 = DIM;   // initially hot region

    DataBlock data;
    GPUAnimBitmap bitmap(DIM, DIM, &data);
    data.totalTime = 0;
    data.frames = 0;
    cudaEventCreate(&data.start);
    cudaEventCreate(&data.stop);

    // The buffers below are indexed as DIM * DIM floats, so this assumes
    // image_size() is at least DIM * DIM * sizeof(float) - TODO confirm.
    int imageSize = bitmap.image_size();

    cudaMalloc((void**)&data.dev_inSrc, imageSize);
    cudaMalloc((void**)&data.dev_outSrc, imageSize);
    cudaMalloc((void**)&data.dev_constSrc, imageSize);
    cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize);
    cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize);
    cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize);

    float *temp = (float*)malloc(imageSize);
    if (temp == NULL)
    {
        fprintf(stderr, "failed to allocate %d bytes of host memory\n", imageSize);
        return 1;
    }

    // constant-temperature cells: 0 means "not fixed" (see copy_const_kernel)
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= hotX0) && (x < hotX1) && (y >= hotY0) && (y < hotY1))
            temp[i] = MAX_TEMP;
        if ((x >= coldX0) && (x < coldX1) && (y >= coldY0) && (y < coldY1))
            temp[i] = MIN_TEMP;
    }
    cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice);

    // initial temperature field
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= initX0) && (x < initX1) && (y >= initY0) && (y < initY1))
            temp[i] = MAX_TEMP;
    }
    cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice);
    free(temp);

    bitmap.anim_and_exit((void(*)(uchar4*, void*, int))anim_gpu, (void(*)(void*))anim_exit);

    // only reached if anim_and_exit() returns
    getchar();
    return 0;
}
《GPU高性能编程CUDA实战》第八章 图形互操作性的更多相关文章
- [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
- 《GPU高性能编程CUDA实战》第四章 简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
- 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C
▶ 本章介绍了多设备系统下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》第六章 常量内存
▶ 本章介绍了常量内存的使用,并给出了光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...
- 《GPU高性能编程CUDA实战》第五章 线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》附录二 散列表
▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...
- 《GPU高性能编程CUDA实战》第七章 纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...
- 《GPU高性能编程CUDA实战》第三章 CUDA设备相关
▶ 这章介绍了与CUDA设备相关的参数,并给出了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...
- 《GPU高性能编程CUDA实战》附录四 其他头文件
▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...
随机推荐
- HDU 1263:水果(map)
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1263 #include <stdio.h> #include <string.h> ...
- (2)socket的基础使用(基于TCP协议)
socket()模块函数用法 基于TCP协议的套接字程序 netstat -an | findstr 8080 #查看所有TCP和UDP协议的状态,用findstr进行过滤监听8080端口 服务端套 ...
- CTF之LSB信息隐藏术
LSB也就是最低有效位,原理是图片中的像素一般是由三种颜色构成,即三原色(绿红蓝),由这三种颜色可以组成其它各种颜色. 例如在PNG图片的储存中,每个颜色会有8bit,LSB隐写就是修改了像素中的最低 ...
- 一篇文章入门Jmeter性能测试【经典长文】
孟船长 目录 1.性能测试定义2.为什么要做性能测试3.性能测试指标.性能测试分类4.Jmeter性能测试实战[入门级]5.参考文章链接 1.性能测试定义 百度&知乎 性能测试是通过自动化的 ...
- JS页面打印预览功能
点击按钮后就可以直接打印预览,并且隐藏了按钮,如果你不希望按钮显示,可以把那部分代码删除就可以了. <html><head><meta http-equiv=" ...
- 实现JMS规范的ActiveMQ
ActiveMQ是Apache软件基金会的开源产品,支持AMQP协议.MQTT协议(和XMPP协议作用类似).Openwire协议和Stomp协议等多种消息协议.并且ActiveMQ完整支持JMS A ...
- RAC6——crsctl 和 srvctl 命令的区别理解
Oracle Clusterware提供了丰富的工具,我们把常用的工具按照rac的四层机构做了划分,然后分别介绍: 节点层: olsnodes 参数-n -i -p 网络层: oifcfg 四个子命令 ...
- webpack 遇到报错情况及解决
webpack2 报错:optimize.OccurenceOrderPlugin is not a function. 原因:上个版本拼写错误,少写了一个字母r,新版本修正过来了.要写成这样:Occ ...
- SQL的datetime类型数据转换为字符串格式大全
Select CONVERT(varchar(100), GETDATE(), 0): 05 16 2006 10:57AM Select CONVERT(varchar(100), GETDATE( ...
- 好强大的vim配置文件
原文链接 http://www.cnblogs.com/ma6174/archive/2011/12/10/2283393.html