《GPU高性能编程CUDA实战》第七章 纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个例子,分别使用了一维纹理和二维纹理。
● 热传导(使用一维纹理)
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_anim.h"

// Heat-transfer animation on a DIM x DIM grid, reading the temperature fields
// through 1D texture references.
//
// NOTE(review): the published listing had lost every bare integer literal and
// many line breaks. The stencil constants (+-1, 4), the zero stream/initial
// values, the 16x16 launch configuration and the 90 time steps per frame were
// restored to match the chapter 7 heat example of "CUDA by Example" -- confirm
// against the author's project. The rectangle bounds in main() could NOT be
// recovered; see the notes there.
//
// NOTE(review): texture references (texture<>, cudaBindTexture) are a legacy
// API that newer CUDA toolkits no longer provide -- confirm the toolkit version
// used to build this.

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// Texture references are declared at file scope; the kernels below fetch
// through them.
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;

// State shared between main() and the animation callbacks.
struct DataBlock
{
    unsigned char *output_bitmap;   // device buffer written by float_to_color
    float *dev_inSrc;               // device temperature buffer (bound to texIn)
    float *dev_outSrc;              // device temperature buffer (bound to texOut)
    float *dev_constSrc;            // device constant-heater buffer (bound to texConstSrc)
    CPUAnimBitmap *bitmap;
    cudaEvent_t start, stop;        // events used to time each frame
    float totalTime;                // accumulated frame time in ms
    float frames;                   // number of frames rendered so far
};

// Performs one diffusion step: each cell moves towards the average of its
// four neighbours.
//
// dst    - device buffer receiving the updated temperatures.
// dstOut - true: read the current field from texIn; false: read from texOut.
//
// Launch layout: a 2D grid whose total thread count is exactly DIM x DIM (as
// launched by anim_gpu). There is no bounds guard, and the row stride is
// derived from blockDim.x * gridDim.x.
__global__ void blend_kernel(float *dst, bool dstOut)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // Linear offsets of the four neighbours.
    int left = offset - 1;
    int right = offset + 1;
    int top = offset - DIM;
    int bottom = offset + DIM;

    // On a border, fall back to the cell itself so no fetch leaves the grid.
    if (x == 0)
        left++;
    if (x == DIM - 1)
        right--;
    if (y == 0)
        top += DIM;
    if (y == DIM - 1)
        bottom -= DIM;

    float t, l, c, r, b;
    if (dstOut)
    {
        t = tex1Dfetch(texIn, top);
        l = tex1Dfetch(texIn, left);
        c = tex1Dfetch(texIn, offset);
        r = tex1Dfetch(texIn, right);
        b = tex1Dfetch(texIn, bottom);
    }
    else
    {
        t = tex1Dfetch(texOut, top);
        l = tex1Dfetch(texOut, left);
        c = tex1Dfetch(texOut, offset);
        r = tex1Dfetch(texOut, right);
        b = tex1Dfetch(texOut, bottom);
    }

    dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

// Overwrites the input field with the constant-temperature cells: every
// non-zero entry of texConstSrc is copied into iptr at the same offset.
// Uses the same launch layout as blend_kernel.
__global__ void copy_const_kernel(float *iptr)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float c = tex1Dfetch(texConstSrc, offset);
    if (c != 0)
        iptr[offset] = c;
}

// Per-frame callback: advances the simulation by 90 steps, converts the
// result to colours, copies it into the host bitmap and reports the running
// average frame time. `ticks` is unused.
void anim_gpu(DataBlock *d, int ticks)
{
    cudaEventRecord(d->start, 0);
    dim3 blocks(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    CPUAnimBitmap *bitmap = d->bitmap;

    // Selects the input buffer: true means dev_inSrc is the input (and
    // dev_outSrc the output), false means the reverse.
    volatile bool dstOut = true;
    for (int i = 0; i < 90; i++)
    {
        float *in, *out;
        if (dstOut)
        {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }
        else
        {
            in = d->dev_outSrc;
            out = d->dev_inSrc;
        }

        copy_const_kernel<<<blocks, threads>>>(in);
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        dstOut = !dstOut;
    }
    // After an even number of steps the latest field is back in dev_inSrc.
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);

    cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);

    cudaEventRecord(d->stop, 0);
    cudaEventSynchronize(d->stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}

// Exit callback: unbinds the textures and releases device memory and events.
void anim_exit(DataBlock *d)
{
    cudaUnbindTexture(texIn);
    cudaUnbindTexture(texOut);
    cudaUnbindTexture(texConstSrc);
    cudaFree(d->dev_inSrc);
    cudaFree(d->dev_outSrc);
    cudaFree(d->dev_constSrc);
    cudaFree(d->output_bitmap);     // allocated in main(); previously never freed

    cudaEventDestroy(d->start);
    cudaEventDestroy(d->stop);
}

int main(void)
{
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    cudaEventCreate(&data.start);
    cudaEventCreate(&data.stop);

    // NOTE(review): the float buffers are sized with the bitmap's byte size;
    // this presumes image_size() == DIM * DIM * sizeof(float) -- confirm.
    int imageSize = bitmap.image_size();

    cudaMalloc((void**)&data.output_bitmap, imageSize);

    cudaMalloc((void**)&data.dev_inSrc, imageSize);
    cudaMalloc((void**)&data.dev_outSrc, imageSize);
    cudaMalloc((void**)&data.dev_constSrc, imageSize);

    // Bind the device buffers to the texture references declared above.
    cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize);
    cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize);
    cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize);

    float *temp = (float*)malloc(imageSize);

    // Constant-temperature cells (0 means "not a heater").
    // NOTE(review): the original rectangle bounds were lost. The values below
    // are placeholders chosen to match the described output (constant hot
    // patch on the left, constant cold patch in the middle) -- replace with
    // the author's values.
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 100) && (x < 300) && (y >= 412) && (y < 612))
            temp[i] = MAX_TEMP;
        if ((x >= 412) && (x < 612) && (y >= 412) && (y < 612))
            temp[i] = MIN_TEMP;
    }
    cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice);

    // Initial temperature field.
    // NOTE(review): placeholder bounds again (initial hot patch on the right).
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 724) && (x < 924) && (y >= 412) && (y < 612))
            temp[i] = MAX_TEMP;
    }
    cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice);

    free(temp);

    bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);

    getchar();
    return 0;
}
● 输出结果(左侧为恒高温,中间为恒低温,右侧为初始高温点)
● 使用一维纹理内存的过程浓缩一下就变成了以下过程
texture<float> texSrc;  // declare the texture reference at file scope

float *dev_Src;
cudaMalloc((void**)&dev_Src, sizeof(float)*DIM);  // allocate the device buffer
// Bind the buffer to the texture; the last argument is the size in bytes
// (it was NULL, i.e. zero bytes).
cudaBindTexture(NULL, texSrc, dev_Src, sizeof(float)*DIM);

float *temp = (float *)malloc(sizeof(float)*DIM);  // host staging buffer
// Initialize data in temp
cudaMemcpy(dev_Src, temp, sizeof(float)*DIM, cudaMemcpyHostToDevice);
free(temp);  // release only after the copy has read it

// Do something

cudaUnbindTexture(texSrc);  // unbind the texture and release device memory
cudaFree(dev_Src);
● 访问纹理内存时不使用中括号下标,而是调用 tex1Dfetch():
// Global 2D coordinates of this thread.
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
// Linear element index; one row holds blockDim.x * gridDim.x elements.
const int offset = y * (blockDim.x * gridDim.x) + x;
// Read through the texture reference instead of indexing an array.
const float c = tex1Dfetch(texSrc, offset);
● 热传导(使用二维纹理),输出结果与一维纹理的情况相同,速度上没有明显差别
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_anim.h"

// Heat-transfer animation on a DIM x DIM grid, reading the temperature fields
// through 2D texture references.
//
// NOTE(review): the published listing had lost every bare integer literal and
// many line breaks. The texture dimension (2), the stencil constants (+-1, 4),
// the zero stream/initial values, the 16x16 launch configuration and the 90
// time steps per frame were restored to match the chapter 7 heat example of
// "CUDA by Example" -- confirm against the author's project. The rectangle
// bounds in main() could NOT be recovered; see the notes there.
//
// NOTE(review): texture references (texture<>, cudaBindTexture2D) are a legacy
// API that newer CUDA toolkits no longer provide -- confirm the toolkit version
// used to build this.

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// 2D texture references, declared at file scope.
texture<float, 2> texConstSrc;
texture<float, 2> texIn;
texture<float, 2> texOut;

// State shared between main() and the animation callbacks.
struct DataBlock
{
    unsigned char *output_bitmap;   // device buffer written by float_to_color
    float *dev_inSrc;               // device temperature buffer (bound to texIn)
    float *dev_outSrc;              // device temperature buffer (bound to texOut)
    float *dev_constSrc;            // device constant-heater buffer (bound to texConstSrc)
    CPUAnimBitmap *bitmap;
    cudaEvent_t start, stop;        // events used to time each frame
    float totalTime;                // accumulated frame time in ms
    float frames;                   // number of frames rendered so far
};

// Performs one diffusion step: each cell moves towards the average of its
// four neighbours.
//
// dst    - device buffer receiving the updated temperatures.
// dstOut - true: read the current field from texIn; false: read from texOut.
//
// Launch layout: a 2D grid whose total thread count is exactly DIM x DIM (as
// launched by anim_gpu). There is no bounds guard, and the row stride of dst
// is derived from blockDim.x * gridDim.x.
__global__ void blend_kernel(float *dst, bool dstOut)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float t, l, c, r, b;
    // No manual border handling here (per the original author's note).
    // NOTE(review): presumably relies on the texture's default clamp
    // addressing for the x-1 / x+1 / y-1 / y+1 fetches at the edges -- confirm.
    if (dstOut)
    {
        t = tex2D(texIn, x, y - 1);
        l = tex2D(texIn, x - 1, y);
        c = tex2D(texIn, x, y);
        r = tex2D(texIn, x + 1, y);
        b = tex2D(texIn, x, y + 1);
    }
    else
    {
        t = tex2D(texOut, x, y - 1);
        l = tex2D(texOut, x - 1, y);
        c = tex2D(texOut, x, y);
        r = tex2D(texOut, x + 1, y);
        b = tex2D(texOut, x, y + 1);
    }
    dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

// Overwrites the input field with the constant-temperature cells: every
// non-zero texel of texConstSrc is copied into iptr at the matching offset.
// Uses the same launch layout as blend_kernel.
__global__ void copy_const_kernel(float *iptr)
{
    // Map from threadIdx/blockIdx to pixel position.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float c = tex2D(texConstSrc, x, y);
    if (c != 0)
        iptr[offset] = c;
}

// Per-frame callback: advances the simulation by 90 steps, converts the
// result to colours, copies it into the host bitmap and reports the running
// average frame time. `ticks` is unused.
void anim_gpu(DataBlock *d, int ticks)
{
    cudaEventRecord(d->start, 0);
    dim3 blocks(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    CPUAnimBitmap *bitmap = d->bitmap;

    // Selects the input buffer: true means dev_inSrc is the input (and
    // dev_outSrc the output), false means the reverse.
    volatile bool dstOut = true;
    for (int i = 0; i < 90; i++)
    {
        float *in, *out;
        if (dstOut)
        {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }
        else
        {
            out = d->dev_inSrc;
            in = d->dev_outSrc;
        }
        copy_const_kernel<<<blocks, threads>>>(in);
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        dstOut = !dstOut;
    }
    // After an even number of steps the latest field is back in dev_inSrc.
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);

    cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);

    cudaEventRecord(d->stop, 0);
    cudaEventSynchronize(d->stop);

    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}

// Exit callback: unbinds the textures and releases device memory and events.
void anim_exit(DataBlock *d)
{
    cudaUnbindTexture(texIn);
    cudaUnbindTexture(texOut);
    cudaUnbindTexture(texConstSrc);
    cudaFree(d->dev_inSrc);
    cudaFree(d->dev_outSrc);
    cudaFree(d->dev_constSrc);
    cudaFree(d->output_bitmap);     // allocated in main(); previously never freed

    cudaEventDestroy(d->start);
    cudaEventDestroy(d->stop);
}

int main(void)
{
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    cudaEventCreate(&data.start);
    cudaEventCreate(&data.stop);

    // NOTE(review): the float buffers are sized with the bitmap's byte size;
    // this presumes image_size() == DIM * DIM * sizeof(float) -- confirm.
    int imageSize = bitmap.image_size();

    cudaMalloc((void**)&data.output_bitmap, imageSize);

    cudaMalloc((void**)&data.dev_inSrc, imageSize);
    cudaMalloc((void**)&data.dev_outSrc, imageSize);
    cudaMalloc((void**)&data.dev_constSrc, imageSize);

    // Bind each buffer as a DIM x DIM float texture; the last argument is the
    // row pitch in bytes.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaBindTexture2D(NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, sizeof(float) * DIM);
    cudaBindTexture2D(NULL, texIn, data.dev_inSrc, desc, DIM, DIM, sizeof(float) * DIM);
    cudaBindTexture2D(NULL, texOut, data.dev_outSrc, desc, DIM, DIM, sizeof(float) * DIM);

    float *temp = (float*)malloc(imageSize);

    // Constant-temperature cells (0 means "not a heater").
    // NOTE(review): the original rectangle bounds were lost. The values below
    // are placeholders chosen to match the described output (constant hot
    // patch on the left, constant cold patch in the middle) -- replace with
    // the author's values.
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 100) && (x < 300) && (y >= 412) && (y < 612))
            temp[i] = MAX_TEMP;
        if ((x >= 412) && (x < 612) && (y >= 412) && (y < 612))
            temp[i] = MIN_TEMP;
    }
    cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice);

    // Initial temperature field.
    // NOTE(review): placeholder bounds again (initial hot patch on the right).
    for (int i = 0; i < DIM*DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 724) && (x < 924) && (y >= 412) && (y < 612))
            temp[i] = MAX_TEMP;
    }
    cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice);
    free(temp);

    bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);

    getchar();
    return 0;
}
● 使用纹理内存的过程浓缩一下就变成了以下过程
texture<float, 2> texSrc;  // declare the 2D texture reference at file scope

float *dev_Src;
// Allocate DIM x DIM floats (the size is in bytes, so sizeof(float) is required).
cudaMalloc((void**)&dev_Src, sizeof(float)*DIM*DIM);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
// Bind as a DIM x DIM texture; the last argument is the pitch of ONE row in
// bytes, not the size of the whole image.
cudaBindTexture2D(NULL, texSrc, dev_Src, desc, DIM, DIM, sizeof(float) * DIM);

float *temp = (float*)malloc(sizeof(float)*DIM*DIM);  // host staging buffer
// Initialize data in temp
cudaMemcpy(dev_Src, temp, sizeof(float)*DIM*DIM, cudaMemcpyHostToDevice);
free(temp);  // release only after the copy has read it

// Do something

cudaUnbindTexture(texSrc);  // unbind the texture and release device memory
cudaFree(dev_Src);
● 访问纹理内存时不使用中括号下标,而是调用 tex2D():
// Global 2D coordinates of this thread.
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
// Read the texel at (x, y) through the 2D texture reference.
const float c = tex2D(texSrc, x, y);
《GPU高性能编程CUDA实战》第七章 纹理内存的更多相关文章
- 《GPU高性能编程CUDA实战》第九章 原子性
▶ 本章介绍了原子操作,给出了基于原子操作的直方图计算的例子. ● 章节代码 #include <stdio.h> #include "cuda_runtime.h" ...
- [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
- 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C
▶ 本章介绍了多设备系统下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》第五章 线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
- 《GPU高性能编程CUDA实战》第四章 简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
- 《GPU高性能编程CUDA实战》第六章 常量内存
▶ 本章介绍了常量内存的使用,并给出光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...
- 《GPU高性能编程CUDA实战》第三章 CUDA设备相关
▶ 这章介绍了与CUDA设备相关的参数,并给出了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...
- 《GPU高性能编程CUDA实战中文》中第四章的julia实验
在整个过程中出现了各种问题,我先将我调试好的整个项目打包,提供下载. /* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. ...
- 《GPU高性能编程CUDA实战》附录二 散列表
▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...
随机推荐
- dfs、遍历与for
dfs实际上就是若干个递归式连续使用,从而把所有情况全部遍历的方法 首先是递归式的连用,然后注意参数的选取以及变化就行了 1.参数一般有状态参数与开关参数 最简单的dfs就是每次选择只是改变自身状态( ...
- hdu 1203 dp(关于概率的```背包?)
题意:一个人手里有一笔钱 n ,有 m 所大学,分别知道这些大学的投简历花费和被录取概率,因为钱数有限,只能投一部分学校,问被录取的概率最大有多大. 这题除去计算概率以外就是一个 0 1 背包问题,所 ...
- python入门20180717-迭代器、生成器和协程
迭代器.生成器和协程 python中任意的对象,只要它定义了可以返回一个迭代器的__iter__方法,或者支持下标索引的_getitem_方法,那么它就是一个可迭代对象. 可迭代的对象不一定就是迭代器 ...
- ML(3): 贝叶斯方法
对于分类问题,我们每个人每天都在执行分类操作,只是我们没有意识到罢了.例如,当你看到一个陌生人,你的脑子下意识判断TA是男是女:你可能经常会走在路上对身旁的朋友说“这个人一看就很有钱.那边有个非主流” ...
- 【appium】根据id定位元素
目前没有尝试成功,等成功后补充 id=Resource Id,可以通过UIAutomatorViewer获得.如果目标设备的API Level低于18则UIAutomatorViewer不能获得对应的 ...
- IE11 FOR WIN7 32 装的补丁
- java 网络编程UDP
获得主机名 和 ip 的操作 简单示例 发送 接收 发送:键盘录入获得数据 接收:接收端持续接收数据 配合多线程可以完成一个聊天的功能.
- 杂项:Unity3D
ylbtech-杂项:Unity3D Unity3D是由Unity Technologies开发的一个让玩家轻松创建诸如三维视频游戏.建筑可视化.实时三维动画等类型互动内容的多平台的综合型游戏开发工具 ...
- 杂项-自动化测试工具:Selenium(浏览器自动化测试框架)
ylbtech-杂项-自动化测试工具:Selenium(浏览器自动化测试框架) Selenium 是一个用于Web 应用程序测试的工具.Selenium 测试直接运行在浏览器中,就像真正的用户在操作一 ...
- LDA-MySql
http://blog.csdn.net/white_smile/article/details/19565701