《GPU高性能编程CUDA实战》第七章 纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个例子,分别使用了一维和二维纹理。
● 热传导(使用一维纹理)
- #include <stdio.h>
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include "D:\Code\CUDA\book\common\book.h"
- #include "D:\Code\CUDA\book\common\cpu_anim.h"
- #define DIM 1024
- #define PI 3.1415926535897932f
- #define MAX_TEMP 1.0f
- #define MIN_TEMP 0.0001f
- #define SPEED 0.25f
- // Texture references declared at file scope; main binds each one to a device buffer with cudaBindTexture
- texture<float> texConstSrc;
- texture<float> texIn;
- texture<float> texOut;
- // State shared between main, anim_gpu and anim_exit for the 1D-texture heat simulation.
- struct DataBlock
- {
-     // Device buffer that float_to_color fills and that is copied into the host bitmap each frame.
-     unsigned char *output_bitmap;
-     // The two temperature grids on the device; anim_gpu alternates which one is read and which is written.
-     float *dev_inSrc;
-     float *dev_outSrc;
-     // Device grid of fixed-temperature cells; non-zero entries are re-applied by copy_const_kernel.
-     float *dev_constSrc;
-     // Host-side animation bitmap that receives each rendered frame.
-     CPUAnimBitmap *bitmap;
-     // Events recorded at the beginning and end of every frame for timing.
-     cudaEvent_t start;
-     cudaEvent_t stop;
-     // Sum of per-frame elapsed times reported by cudaEventElapsedTime.
-     float totalTime;
-     // Number of frames rendered so far (kept as float for the average computation).
-     float frames;
- };
- // One diffusion step: each thread updates one cell from its four neighbours.
- //   dst    - device grid that receives the updated temperatures
- //   dstOut - true: read the current state through texIn; false: read it through texOut
- // Expects a 2D launch whose total extent is DIM x DIM threads (top/bottom use a row stride of DIM).
- __global__ void blend_kernel(float *dst, bool dstOut)
- {
-     int x = threadIdx.x + blockIdx.x * blockDim.x;
-     int y = threadIdx.y + blockIdx.y * blockDim.y;
-     int offset = x + y * blockDim.x * gridDim.x;
-     // Linear indices of the left/right/top/bottom neighbours.
-     int left = offset - 1;
-     int right = offset + 1;
-     int top = offset - DIM;
-     int bottom = offset + DIM;
-     // On a border the missing neighbour is replaced by the cell itself.
-     if (x == 0)
-         left++;
-     if (x == DIM - 1)
-         right--;
-     if (y == 0)
-         top += DIM;
-     if (y == DIM - 1)
-         bottom -= DIM;
-     float t, l, c, r, b;
-     if (dstOut)
-     {
-         t = tex1Dfetch(texIn, top);
-         l = tex1Dfetch(texIn, left);
-         c = tex1Dfetch(texIn, offset);
-         r = tex1Dfetch(texIn, right);
-         b = tex1Dfetch(texIn, bottom);
-     }
-     else
-     {
-         t = tex1Dfetch(texOut, top);
-         l = tex1Dfetch(texOut, left);
-         c = tex1Dfetch(texOut, offset);
-         r = tex1Dfetch(texOut, right);
-         b = tex1Dfetch(texOut, bottom);
-     }
-     // new = old + SPEED * (sum of the four neighbours - 4 * old)
-     dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
- }
- // Overwrites cells of iptr with the fixed temperatures read through texConstSrc.
- //   iptr - device grid used as the input of the next blend step
- // Cells whose constant value is 0 are left untouched, so 0 acts as "no heater here".
- __global__ void copy_const_kernel(float *iptr)
- {
-     int x = threadIdx.x + blockIdx.x * blockDim.x;
-     int y = threadIdx.y + blockIdx.y * blockDim.y;
-     int offset = x + y * blockDim.x * gridDim.x;
-     float c = tex1Dfetch(texConstSrc, offset);
-     if (c != 0)
-         iptr[offset] = c;
- }
- // Per-frame callback: advances the simulation, renders it into the bitmap and prints the average frame time.
- //   d     - simulation state (device buffers, bitmap, timing events)
- //   ticks - frame counter supplied by the animation loop; not used here
- void anim_gpu(DataBlock *d, int ticks)
- {
-     cudaEventRecord(d->start, 0);
-     // 16x16 threads per block, enough blocks to cover the DIM x DIM grid exactly.
-     dim3 blocks(DIM / 16, DIM / 16);
-     dim3 threads(16, 16);
-     CPUAnimBitmap *bitmap = d->bitmap;
-     // Selects the current input grid: true means dev_inSrc, false means dev_outSrc.
-     volatile bool dstOut = true;
-     // NOTE(review): the step count was lost from the listing; 90 is the value used by the book's sample.
-     // It must stay even so that the newest state ends up in dev_inSrc, which is what gets rendered below.
-     for (int i = 0; i < 90; i++)
-     {
-         float *in, *out;
-         if (dstOut)
-         {
-             in = d->dev_inSrc;
-             out = d->dev_outSrc;
-         }
-         else
-         {
-             in = d->dev_outSrc;
-             out = d->dev_inSrc;
-         }
-         copy_const_kernel << < blocks, threads >> > (in);
-         blend_kernel << < blocks, threads >> > (out, dstOut);
-         dstOut = !dstOut;
-     }
-     float_to_color << < blocks, threads >> > (d->output_bitmap, d->dev_inSrc);
-     cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
-     cudaEventRecord(d->stop, 0);
-     cudaEventSynchronize(d->stop);
-     float elapsedTime;
-     cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
-     d->totalTime += elapsedTime;
-     ++d->frames;
-     printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
- }
- // Exit callback: releases everything main set up for the simulation.
- //   d - simulation state whose device buffers and events are released
- void anim_exit(DataBlock *d)
- {
-     // Detach the three texture references from their buffers first.
-     cudaUnbindTexture(texIn);
-     cudaUnbindTexture(texOut);
-     cudaUnbindTexture(texConstSrc);
-     // Then release the device grids.
-     cudaFree(d->dev_inSrc);
-     cudaFree(d->dev_outSrc);
-     cudaFree(d->dev_constSrc);
-     // Finally destroy the timing events.
-     cudaEventDestroy(d->start);
-     cudaEventDestroy(d->stop);
- }
- // Sets up the 1D-texture heat simulation: allocates the device grids, binds them to the
- // texture references, uploads the fixed-temperature map and the initial field, then starts the animation.
- int main(void)
- {
-     DataBlock data;
-     CPUAnimBitmap bitmap(DIM, DIM, &data);
-     data.bitmap = &bitmap;
-     data.totalTime = 0;
-     data.frames = 0;
-     cudaEventCreate(&data.start);
-     cudaEventCreate(&data.stop);
-     // NOTE(review): the float grids are sized with image_size(); this assumes it equals
-     // DIM * DIM * sizeof(float) -- confirm against CPUAnimBitmap.
-     int imageSize = bitmap.image_size();
-     cudaMalloc((void**)&data.output_bitmap, imageSize);
-     cudaMalloc((void**)&data.dev_inSrc, imageSize);
-     cudaMalloc((void**)&data.dev_outSrc, imageSize);
-     cudaMalloc((void**)&data.dev_constSrc, imageSize);
-     // Bind the device buffers to the texture references declared at file scope.
-     cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize);
-     cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize);
-     cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize);
-     float *temp = (float*)malloc(imageSize);
-     if (temp == NULL)
-     {
-         fprintf(stderr, "host allocation of %d bytes failed\n", imageSize);
-         return 1;
-     }
-     // NOTE(review): the rectangle bounds in both loops below were lost from the listing.
-     // The values are placeholders matching the described picture (constant hot area on the left,
-     // constant cold area in the middle, initial hot spot on the right) -- confirm against the original program.
-     for (int i = 0; i < DIM*DIM; i++) // fixed-temperature cells
-     {
-         temp[i] = 0;
-         int x = i % DIM;
-         int y = i / DIM;
-         if ((x >= 100) && (x < 300) && (y >= 400) && (y < 600))
-             temp[i] = MAX_TEMP;
-         if ((x >= 450) && (x < 550) && (y >= 450) && (y < 550))
-             temp[i] = MIN_TEMP;
-     }
-     cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice);
-     for (int i = 0; i < DIM*DIM; i++) // initial temperature field
-     {
-         temp[i] = 0.5f;
-         int x = i % DIM;
-         int y = i / DIM;
-         if ((x >= 750) && (x < 900) && (y >= 450) && (y < 550))
-             temp[i] = MAX_TEMP;
-     }
-     cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice);
-     free(temp);
-     bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);
-     getchar();
-     return 0;
- }
● 输出结果(左侧为恒高温,中间为恒低温,右侧为初始高温点)
● 使用一维纹理内存的过程浓缩一下就变成了以下过程
- texture<float> texSrc; // texture reference, declared at file scope
- float *dev_Src;
- cudaMalloc((void**)&dev_Src, sizeof(float)*DIM); // allocate the device buffer
- // Bind the whole buffer; the last argument is the size in bytes (passing NULL would bind 0 bytes).
- cudaBindTexture(NULL, texSrc, dev_Src, sizeof(float)*DIM);
- float *temp = (float *)malloc(sizeof(float)*DIM); // host staging buffer
- // Initialize the data in temp here
- cudaMemcpy(dev_Src, temp, sizeof(float)*DIM, cudaMemcpyHostToDevice);
- free(temp); // the staging buffer is only safe to release after the copy
- // Do something
- cudaUnbindTexture(texSrc); // unbind the texture, then release the device buffer
- cudaFree(dev_Src);
● 访问纹理内存不用中括号下标,而是
- // Global 2D thread coordinates.
- int x = blockIdx.x * blockDim.x + threadIdx.x;
- int y = blockIdx.y * blockDim.y + threadIdx.y;
- // Flatten to a linear element index; the row length is the total thread count along x.
- int offset = y * blockDim.x * gridDim.x + x;
- // Read element `offset` through the 1D texture reference instead of indexing a pointer.
- float c = tex1Dfetch(texSrc, offset);
● 热传导(使用二维纹理),输出结果与一维纹理的情况相同,速度上没有明显差别
- #include <stdio.h>
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include "D:\Code\CUDA\book\common\book.h"
- #include "D:\Code\CUDA\book\common\cpu_anim.h"
- #define DIM 1024
- #define PI 3.1415926535897932f
- #define MAX_TEMP 1.0f
- #define MIN_TEMP 0.0001f
- #define SPEED 0.25f
- // Two-dimensional texture references (second template argument is the dimensionality);
- // main binds each one to a DIM x DIM float buffer with cudaBindTexture2D.
- texture<float, 2> texConstSrc;
- texture<float, 2> texIn;
- texture<float, 2> texOut;
- // State shared between main, anim_gpu and anim_exit for the 2D-texture heat simulation.
- struct DataBlock
- {
-     // Device buffer that float_to_color fills and that is copied into the host bitmap each frame.
-     unsigned char *output_bitmap;
-     // The two temperature grids on the device; anim_gpu alternates which one is read and which is written.
-     float *dev_inSrc;
-     float *dev_outSrc;
-     // Device grid of fixed-temperature cells; non-zero entries are re-applied by copy_const_kernel.
-     float *dev_constSrc;
-     // Host-side animation bitmap that receives each rendered frame.
-     CPUAnimBitmap *bitmap;
-     // Events recorded at the beginning and end of every frame for timing.
-     cudaEvent_t start;
-     cudaEvent_t stop;
-     // Sum of per-frame elapsed times reported by cudaEventElapsedTime.
-     float totalTime;
-     // Number of frames rendered so far (kept as float for the average computation).
-     float frames;
- };
- // One diffusion step using 2D texture reads: each thread updates one cell from its four neighbours.
- //   dst    - device grid that receives the updated temperatures
- //   dstOut - true: read the current state through texIn; false: read it through texOut
- __global__ void blend_kernel(float *dst,bool dstOut)
- {
-     int x = threadIdx.x + blockIdx.x * blockDim.x;
-     int y = threadIdx.y + blockIdx.y * blockDim.y;
-     int offset = x + y * blockDim.x * gridDim.x;
-     float t, l, c, r, b;
-     // No explicit border handling here: coordinates that fall outside the texture are resolved by
-     // the texture's addressing mode (presumably the default clamp mode -- nothing in this file changes it).
-     if (dstOut)
-     {
-         t = tex2D(texIn, x, y - 1);
-         l = tex2D(texIn, x - 1, y);
-         c = tex2D(texIn, x, y);
-         r = tex2D(texIn, x + 1, y);
-         b = tex2D(texIn, x, y + 1);
-     }
-     else
-     {
-         t = tex2D(texOut, x, y - 1);
-         l = tex2D(texOut, x - 1, y);
-         c = tex2D(texOut, x, y);
-         r = tex2D(texOut, x + 1, y);
-         b = tex2D(texOut, x, y + 1);
-     }
-     // new = old + SPEED * (sum of the four neighbours - 4 * old)
-     dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
- }
- // Overwrites cells of iptr with the fixed temperatures read through the 2D texture texConstSrc.
- //   iptr - device grid used as the input of the next blend step
- // Cells whose constant value is 0 are left untouched, so 0 acts as "no heater here".
- __global__ void copy_const_kernel(float *iptr)
- {
-     // map from threadIdx/BlockIdx to pixel position
-     int x = threadIdx.x + blockIdx.x * blockDim.x;
-     int y = threadIdx.y + blockIdx.y * blockDim.y;
-     int offset = x + y * blockDim.x * gridDim.x;
-     float c = tex2D(texConstSrc, x, y);
-     if (c != 0)
-         iptr[offset] = c;
- }
- // Per-frame callback: advances the simulation, renders it into the bitmap and prints the average frame time.
- //   d     - simulation state (device buffers, bitmap, timing events)
- //   ticks - frame counter supplied by the animation loop; not used here
- void anim_gpu(DataBlock *d, int ticks)
- {
-     cudaEventRecord(d->start, 0);
-     // 16x16 threads per block, enough blocks to cover the DIM x DIM grid exactly.
-     dim3 blocks(DIM / 16, DIM / 16);
-     dim3 threads(16, 16);
-     CPUAnimBitmap *bitmap = d->bitmap;
-     // Selects the current input grid: true means dev_inSrc, false means dev_outSrc.
-     volatile bool dstOut = true;
-     // NOTE(review): the step count was lost from the listing; 90 is the value used by the book's sample.
-     // It must stay even so that the newest state ends up in dev_inSrc, which is what gets rendered below.
-     for (int i = 0; i < 90; i++)
-     {
-         float *in, *out;
-         if (dstOut)
-         {
-             in = d->dev_inSrc;
-             out = d->dev_outSrc;
-         }
-         else
-         {
-             out = d->dev_inSrc;
-             in = d->dev_outSrc;
-         }
-         copy_const_kernel << <blocks, threads >> > (in);
-         blend_kernel << <blocks, threads >> > (out, dstOut);
-         dstOut = !dstOut;
-     }
-     float_to_color << <blocks, threads >> > (d->output_bitmap, d->dev_inSrc);
-     cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
-     cudaEventRecord(d->stop, 0);
-     cudaEventSynchronize(d->stop);
-     float elapsedTime;
-     cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
-     d->totalTime += elapsedTime;
-     ++d->frames;
-     printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
- }
- // Exit callback: releases everything main set up for the simulation.
- //   d - simulation state whose device buffers and events are released
- void anim_exit(DataBlock *d)
- {
-     // Detach the three texture references from their buffers first.
-     cudaUnbindTexture(texIn);
-     cudaUnbindTexture(texOut);
-     cudaUnbindTexture(texConstSrc);
-     // Then release the device grids.
-     cudaFree(d->dev_inSrc);
-     cudaFree(d->dev_outSrc);
-     cudaFree(d->dev_constSrc);
-     // Finally destroy the timing events.
-     cudaEventDestroy(d->start);
-     cudaEventDestroy(d->stop);
- }
- // Sets up the 2D-texture heat simulation: allocates the device grids, binds them as DIM x DIM
- // float textures, uploads the fixed-temperature map and the initial field, then starts the animation.
- int main(void)
- {
-     DataBlock data;
-     CPUAnimBitmap bitmap(DIM, DIM, &data);
-     data.bitmap = &bitmap;
-     data.totalTime = 0;
-     data.frames = 0;
-     cudaEventCreate(&data.start);
-     cudaEventCreate(&data.stop);
-     // NOTE(review): the float grids are sized with image_size(); this assumes it equals
-     // DIM * DIM * sizeof(float) -- confirm against CPUAnimBitmap.
-     int imageSize = bitmap.image_size();
-     cudaMalloc((void**)&data.output_bitmap, imageSize);
-     cudaMalloc((void**)&data.dev_inSrc, imageSize);
-     cudaMalloc((void**)&data.dev_outSrc, imageSize);
-     cudaMalloc((void**)&data.dev_constSrc, imageSize);
-     // Bind each buffer as a DIM x DIM texture of floats; the last argument is the row pitch in bytes.
-     cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
-     cudaBindTexture2D(NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, sizeof(float) * DIM);
-     cudaBindTexture2D(NULL, texIn, data.dev_inSrc, desc, DIM, DIM, sizeof(float) * DIM);
-     cudaBindTexture2D(NULL, texOut, data.dev_outSrc, desc, DIM, DIM, sizeof(float) * DIM);
-     float *temp = (float*)malloc(imageSize);
-     if (temp == NULL)
-     {
-         fprintf(stderr, "host allocation of %d bytes failed\n", imageSize);
-         return 1;
-     }
-     // NOTE(review): the rectangle bounds in both loops below were lost from the listing.
-     // The values are placeholders matching the described picture (constant hot area on the left,
-     // constant cold area in the middle, initial hot spot on the right) -- confirm against the original program.
-     for (int i = 0; i < DIM*DIM; i++) // fixed-temperature cells
-     {
-         temp[i] = 0;
-         int x = i % DIM;
-         int y = i / DIM;
-         if ((x >= 100) && (x < 300) && (y >= 400) && (y < 600))
-             temp[i] = MAX_TEMP;
-         if ((x >= 450) && (x < 550) && (y >= 450) && (y < 550))
-             temp[i] = MIN_TEMP;
-     }
-     cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice);
-     for (int i = 0; i < DIM*DIM; i++) // initial temperature field
-     {
-         temp[i] = 0.5f;
-         int x = i % DIM;
-         int y = i / DIM;
-         if ((x >= 750) && (x < 900) && (y >= 450) && (y < 550))
-             temp[i] = MAX_TEMP;
-     }
-     cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice);
-     free(temp);
-     bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);
-     getchar();
-     return 0;
- }
● 使用纹理内存的过程浓缩一下就变成了以下过程
- texture<float, 2> texSrc; // 2D texture reference, declared at file scope
- float *dev_Src;
- cudaMalloc((void**)&dev_Src, sizeof(float)*DIM*DIM); // DIM x DIM floats, size given in bytes
- cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
- // Width and height are in elements; the last argument is the pitch, i.e. the size of ONE row in bytes.
- cudaBindTexture2D(NULL, texSrc, dev_Src, desc, DIM, DIM, sizeof(float) * DIM);
- float *temp = (float*)malloc(sizeof(float)*DIM*DIM); // host staging buffer
- // Initialize the data in temp here
- cudaMemcpy(dev_Src, temp, sizeof(float)*DIM*DIM, cudaMemcpyHostToDevice);
- free(temp); // the staging buffer is only safe to release after the copy
- // Do something
- cudaUnbindTexture(texSrc); // unbind the texture, then release the device buffer
- cudaFree(dev_Src);
● 访问纹理内存不用中括号下标,而是
- // Global 2D thread coordinates double as the texel coordinates.
- int x = blockIdx.x * blockDim.x + threadIdx.x;
- int y = blockIdx.y * blockDim.y + threadIdx.y;
- // Read texel (x, y) through the 2D texture reference; no linear offset is needed.
- float c = tex2D(texSrc, x, y);
《GPU高性能编程CUDA实战》第七章 纹理内存的更多相关文章
- 《GPU高性能编程CUDA实战》第九章 原子性
▶ 本章介绍了原子操作,给出了基于原子操作的直方图计算的例子. ● 章节代码 #include <stdio.h> #include "cuda_runtime.h" ...
- [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
- 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C
▶ 本章介绍了多设备胸膛下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》第五章 线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
- 《GPU高性能编程CUDA实战》第四章 简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
- 《GPU高性能编程CUDA实战》第六章 常量内存
▶ 本章介绍了常量内存的使用,并给出光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...
- 《GPU高性能编程CUDA实战》第三章 CUDA设备相关
▶ 这章介绍了与CUDA设备相关的参数,并给出了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...
- 《GPU高性能编程CUDA实战中文》中第四章的julia实验
在整个过程中出现了各种问题,我先将我调试好的真个项目打包,提供下载. /* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. ...
- 《GPU高性能编程CUDA实战》附录二 散列表
▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...
随机推荐
- 使用 WPF 开发一个 Windows 屏幕保护程序
最近有小伙伴问我如何可以让 Windows 静置一段时间不操作之后,显示一个特殊的界面.我想了想,屏幕保护程序可以做到这一点,而且,屏幕保护程序的开发也是非常简单的. 本文将介绍如何为 Windows ...
- 实习第一周第一天:接口 extends是继承类,implement是实现接口,原接口里面的方法填充,方法名也是不变,重写override是父类的方法名不变,把方法体给改了
一.定义 Java接口(Interface),是一系列方法的声明,是一些方法特征的集合,一个接口只有方法的特征没有方法的实现,因此这些方法可以在不同的地方被不同的类实现,而这些实现可以具有不同的行为( ...
- web开发的一些总结
现在我们是在互联网的时代,到处可以使用internet 这些年的发展,让we 成为了当前开发的主流,包括现在好多的移动端开发, 很多也是使用web 页面进行呈现,因为web 拉近了你我之间的距离.对于 ...
- tomcat下安装jenkins
参考网址:http://www.cnblogs.com/edward2013/p/5269465.html 5.安装Jenkins 方法1: jenkins.war下载地址: http://mir ...
- SQL SERVER 2008 彻底卸载干净方法 (转)
最近安装SQL SERVER 2008失败后,再重新安装时老是报错,东搞西搞的很难卸干净.但又不方便重装系统,经按下面方法终于搞定并成功安装上2008 1.停掉SQL SERVER 2008所有相关服 ...
- POJ2127 LICS模板
题目:http://poj.org/problem?id=2127 十分费劲地终于记录好了路径……用一个前驱. 这是 n^2 的LICS方法.其实就是 n ^ 2 log n 把“找之前的d [ j ...
- WPF中控制窗口显示位置的三种方式
首先新建一个WPF工程,在主界面添加一个按钮,并给按钮添加点击事件button1_Click,然后新建一个用于测试弹出位置的窗口TestWindow.1.在屏幕中间显示,设置window.Window ...
- JMeter和JMeterPlugin的下载安装
JMeter和JMeterPlugin的下载安装 Apache Jmeter是一个100%的纯Java桌面应用,主要是针对web的压力和性能测试,但后来扩展到其他测试领域.Jmeter可以用于测试FT ...
- hadoop技术入门学习之发行版选择
经常会看到这样的问题:零基础学习hadoop难不难?有的人回答说:零基础学习hadoop,没有想象的那么难,也没有想象的那么容易.看到这样的答案不免觉得有些尴尬,这个问题算是白问了,因为这个回答似乎什 ...
- 【jmeter】jmeter之-聚合点
集合点:简单来理解一下,虽然我们的“性能测试”理解为“多用户并发测试”,但真正的并发是不存在的,为了更真实的实现并发这感念,我们可以在需要压力的地方设置集合点, 还拿那个用户和密码的地方,每到输入用户 ...