GPU Tips
<1> Basic
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define NUM 15

__global__ void square(float *dout, float *din)
{
    int idx = threadIdx.x;
    float f = din[idx];
    dout[idx] = f * f;
}

int main(int argc, char **argv)
{
    const int bytes = sizeof(float) * NUM;
    float host_in[NUM];
    // fill the input with some values
    for (int i = 0; i < NUM; i++)
    {
        host_in[i] = float(i);
    }
    float host_out[NUM];
    cudaError_t cudaStatus;

    // GPU settings:
    // choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    // allocate GPU memory
    float *device_in = NULL;
    float *device_out = NULL;
    cudaStatus = cudaMalloc((void**)&device_in, bytes);
    cudaStatus = cudaMalloc((void**)&device_out, bytes);
    cudaStatus = cudaMemcpy(device_in, host_in, bytes, cudaMemcpyHostToDevice);

    // launch the GPU kernel: 1 block, NUM threads
    square<<<1, NUM>>>(device_out, device_in);
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching square!\n", cudaStatus);
    }
    cudaStatus = cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    // free GPU memory
    cudaFree(device_in);
    cudaFree(device_out);

    for (int i = 0; i < NUM; i++)
    {
        fprintf(stdout, "%f \n", host_out[i]);
    }
    getchar();
    return 0;
}
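One gap in the example above: the kernel launch itself is never checked. Launch-configuration errors (for example an invalid thread count) are reported through cudaGetLastError rather than through the <<<>>> call, so a common pattern is to query it right after the launch. A minimal sketch, not part of the original code:

square<<<1, NUM>>>(device_out, device_in);

// The launch is asynchronous; configuration errors surface here,
// not as a return value of the <<<>>> call itself.
cudaError_t launchStatus = cudaGetLastError();
if (launchStatus != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
}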
<2> N blocks with one-dimensional threads per block
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>

#define ARRAYSize 50000000
#define THREADS_PER_BLOCK 1024

#define fnvalue(a, size)\
{\
    for (int i = 0; i < size; i++)\
    {\
        a[i] = float(i);\
    }\
}

#define CHECK_CUDA_STATUS(STATUS)\
{\
    if (STATUS != cudaSuccess)\
    {\
        fprintf(stdout, "Error in line %d\n", __LINE__);\
    }\
}

__global__ void add(float *d_out, float *d_x, float *d_y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // bounds check: the last block may have threads past the end of the array
    if (index < ARRAYSize)
    {
        d_out[index] = d_x[index] + d_y[index];
    }
}

int main(int argc, char **argv)
{
    const int bytes = sizeof(float) * ARRAYSize;

    // host memory
    float *h_x = (float*)malloc(bytes);
    float *h_y = (float*)malloc(bytes);
    float *h_out = (float*)malloc(bytes);

    // fill the host arrays
    fnvalue(h_x, ARRAYSize);
    fnvalue(h_y, ARRAYSize);

    // device memory
    float *d_x, *d_y, *d_out;

    // cuda settings
    cudaError_t dstat;
    dstat = cudaSetDevice(0);
    CHECK_CUDA_STATUS(dstat);
    dstat = cudaMalloc((void**)&d_x, bytes);
    CHECK_CUDA_STATUS(dstat);
    dstat = cudaMalloc((void**)&d_y, bytes);
    CHECK_CUDA_STATUS(dstat);
    dstat = cudaMalloc((void**)&d_out, bytes);
    CHECK_CUDA_STATUS(dstat);

    fprintf(stdout, "Copy data to GPU\n");
    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice);

    // round the block count up so the whole array is covered;
    // ARRAYSize/THREADS_PER_BLOCK alone would truncate and miss the tail
    add<<<(ARRAYSize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_out, d_x, d_y);
    cudaDeviceSynchronize();

    fprintf(stdout, "Copy GPU data to CPU\n");
    dstat = cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);

    // debug some values: print the first 100, ten per line
    for (int i = 0; i < 100; i++)
    {
        if ((i + 1) % 10 == 0)
        {
            fprintf(stdout, "%f\n", h_out[i]);
        }
        else
        {
            fprintf(stdout, "%f ", h_out[i]);
        }
    }
    getchar();

    // free CPU memory
    free(h_x);
    free(h_y);
    free(h_out);

    // free GPU memory
    dstat = cudaFree(d_x);
    CHECK_CUDA_STATUS(dstat);
    dstat = cudaFree(d_y);
    CHECK_CUDA_STATUS(dstat);
    dstat = cudaFree(d_out);
    CHECK_CUDA_STATUS(dstat);
    return 0;
}
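Note that ARRAYSize (50000000) is not a multiple of THREADS_PER_BLOCK (1024), which is why the launch above rounds the block count up and the kernel keeps its index < ARRAYSize guard. The round-up division is a common enough idiom to pull into a helper; a sketch (the helper name is mine):

// Round-up integer division: number of blocks of `block` threads needed to cover n items.
inline int divUp(int n, int block) { return (n + block - 1) / block; }

// Extra threads in the last block fail the kernel's bounds check and do nothing.
add<<<divUp(ARRAYSize, THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>(d_out, d_x, d_y);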
<3> Unified memory:
#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
    for (int i = 0; i < n; i++)
        y[i] = x[i] + y[i];
}

int main(void)
{
    int N = 1 << 20; // 1M elements

    float *x, *y;
    // Allocate unified memory – accessible from CPU or GPU
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run the kernel on 1M elements on the GPU (a single thread here;
    // the grid-stride version in the next section parallelizes it)
    add<<<1, 1>>>(N, x, y);

    // Wait for the GPU to finish before accessing the data on the host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(x);
    cudaFree(y);
    return 0;
}
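With cudaMallocManaged, pages migrate between host and device on demand, so the first kernel touching the data pays for the page faults. On CUDA 8+ with a Pascal-or-newer GPU the migration can be started ahead of time with cudaMemPrefetchAsync; a sketch under those assumptions:

int device = 0;
cudaGetDevice(&device);

// Move both managed allocations to the GPU before the launch,
// so the kernel itself does not fault the pages in one by one.
cudaMemPrefetchAsync(x, N * sizeof(float), device, 0);
cudaMemPrefetchAsync(y, N * sizeof(float), device, 0);
add<<<1, 1>>>(N, x, y);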
<4> Some tips
(1) The grid-stride loop. The kernel below shows how a one-dimensional grid of blocks covers the data: each thread starts at its global index and advances by the total number of threads in the grid, so any grid size processes all n elements.
__global__
void add(int n, float *x, float *y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}
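A host-side launch for this kernel might look as follows (the block size of 256 is illustrative; because of the stride loop the kernel is correct for any grid size, smaller grids just make each thread handle more elements):

int blockSize = 256;
// Enough blocks to cover n once; rounding up handles n not divisible by blockSize.
int numBlocks = (n + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(n, x, y);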
(2) Shared memory is memory shared by the threads of a single block. The stencil example below loads each block's tile, plus a halo of RADIUS elements on each side, into shared memory before computing.
code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>
#include <stdio.h>
#include <stdlib.h>

#define RADIUS 3
#define BLOCKSIZE 10

__global__ void process(int *d_out, int *d_in, int *shared_mem)
{
    // tile plus a halo of RADIUS elements on each side
    __shared__ int temp[BLOCKSIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;
    int total = blockDim.x * gridDim.x; // total input elements (matches the launch below)

    // Read input elements into shared memory
    temp[lindex] = d_in[gindex];
    if (threadIdx.x < RADIUS)
    {
        // halo elements; zero-pad where the halo would fall outside the input
        temp[lindex - RADIUS] = (gindex >= RADIUS) ? d_in[gindex - RADIUS] : 0;
        temp[lindex + BLOCKSIZE] = (gindex + BLOCKSIZE < total) ? d_in[gindex + BLOCKSIZE] : 0;
    }
    shared_mem[lindex] = lindex; // this code is for debugging only

    // make sure the whole tile is loaded before any thread reads it
    __syncthreads();

    // Apply the stencil
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
    {
        result += temp[lindex + offset];
    }

    // Store the result
    d_out[gindex] = result;
}

int main(int argc, char **argv)
{
    // allocation of memory
    int host_rawSize = 10; // one block's worth of data (matches BLOCKSIZE)
    int host_bytes = sizeof(int) * host_rawSize;
    int shared_bytes = (host_rawSize + 2 * RADIUS) * sizeof(int);
    int *host_data = (int*)malloc(host_bytes);
    int *host_outData = (int*)malloc(host_bytes);
    int *host_sharedMemData = (int*)malloc(shared_bytes);
    for (int i = 0; i < host_rawSize; i++)
    {
        host_data[i] = int(i) + 1;
    }
    for (int i = 0; i < host_rawSize; i++)
    {
        fprintf(stdout, "%d ", host_data[i]);
    }
    fprintf(stdout, "\n");

    int *dev_in;
    cudaMallocManaged((void**)&dev_in, host_bytes);
    cudaMemcpy(dev_in, host_data, host_bytes, cudaMemcpyHostToDevice);

    int dev_out_bytes = host_rawSize * sizeof(int);
    int *dev_out;
    int *dev_shared;
    cudaMallocManaged(&dev_out, dev_out_bytes);
    cudaMallocManaged(&dev_shared, shared_bytes);

    process<<<1, host_rawSize>>>(dev_out, dev_in, dev_shared);

    cudaMemcpy(host_outData, dev_out, dev_out_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_sharedMemData, dev_shared, shared_bytes, cudaMemcpyDeviceToHost);

    printf("===============Debug the gpu shared memory=======================\n");
    for (int i = 0; i < host_rawSize + 2 * RADIUS; i++)
    {
        fprintf(stdout, "%d ", host_sharedMemData[i]);
    }
    printf("\n===============Debug the gpu shared memory=======================\n");
    for (int i = 0; i < host_rawSize; i++)
    {
        fprintf(stdout, "%d ", host_outData[i]);
    }
    fprintf(stdout, "\n");
    getchar();
    return 0;
}
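The stencil above fixes the shared array size at compile time via BLOCKSIZE and RADIUS. When the tile size is only known at run time, CUDA's dynamic shared memory can be used instead: declare the array extern and pass the byte count as the third launch parameter. A sketch (processDyn is a hypothetical variant of the kernel above):

__global__ void processDyn(int *d_out, int *d_in)
{
    // Size is supplied by the third <<<>>> launch parameter.
    extern __shared__ int temp[];
    // ... same tile load, __syncthreads(), and stencil body as above ...
}

// Request (BLOCKSIZE + 2*RADIUS) ints of shared memory per block.
size_t sharedBytes = (BLOCKSIZE + 2 * RADIUS) * sizeof(int);
processDyn<<<1, BLOCKSIZE, sharedBytes>>>(dev_out, dev_in);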
<1> Simple calculation: converting a color pixel to a grey intensity I.
I = (R + G + B) / 3                    (plain average)
I = 0.299f*R + 0.587f*G + 0.114f*B     (weighted luma; closer to perceived brightness — for a pure green pixel the average gives 85 but the weighted sum gives about 150)
CPU:
// Serial implementation for running on CPU using a single thread.
void rgbaToGreyscaleCpu(const uchar4* const rgbaImage, unsigned char* const greyImage,
                        const size_t numRows, const size_t numCols)
{
    for (size_t r = 0; r < numRows; ++r) {
        for (size_t c = 0; c < numCols; ++c) {
            const uchar4 rgba = rgbaImage[r * numCols + c];
            const float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
            greyImage[r * numCols + c] = channelSum;
        }
    }
}
GPU:
// CUDA kernel which is run in parallel by many GPU threads.
__global__
void rgbaToGreyscaleCudaKernel(const uchar4* const rgbaImage,
                               unsigned char* const greyImage,
                               const int numRows, const int numCols)
{
    // First create a mapping from the 2D block and grid locations
    // to an absolute 2D location in the image, then use that to
    // calculate a 1D offset.
    const long pointIndex = threadIdx.x + blockDim.x * blockIdx.x;
    if (pointIndex < numRows * numCols) { // necessary only if too many threads are started
        uchar4 const imagePoint = rgbaImage[pointIndex];
        greyImage[pointIndex] = .299f * imagePoint.x + .587f * imagePoint.y + .114f * imagePoint.z;
    }
}

// Parallel implementation for running on GPU using multiple threads.
void rgbaToGreyscaleCuda(const uchar4* const h_rgbaImage, uchar4* const d_rgbaImage,
                         unsigned char* const d_greyImage, const size_t numRows, const size_t numCols)
{
    const int blockThreadSize = 256; // threads per block
    const int numberOfBlocks = 1 + ((numRows * numCols - 1) / blockThreadSize); // a/b rounded up
    const dim3 blockSize(blockThreadSize, 1, 1);
    const dim3 gridSize(numberOfBlocks, 1, 1);
    rgbaToGreyscaleCudaKernel<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
}
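A caller of rgbaToGreyscaleCuda would allocate the device buffers, copy the RGBA image up, and copy the grey result back; a sketch, where h_rgbaImage and h_greyImage are assumed host buffers of numRows*numCols pixels:

const size_t numPixels = numRows * numCols;
uchar4 *d_rgbaImage;
unsigned char *d_greyImage;
cudaMalloc(&d_rgbaImage, numPixels * sizeof(uchar4));
cudaMalloc(&d_greyImage, numPixels * sizeof(unsigned char));
cudaMemcpy(d_rgbaImage, h_rgbaImage, numPixels * sizeof(uchar4), cudaMemcpyHostToDevice);

rgbaToGreyscaleCuda(h_rgbaImage, d_rgbaImage, d_greyImage, numRows, numCols);

cudaMemcpy(h_greyImage, d_greyImage, numPixels * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_rgbaImage);
cudaFree(d_greyImage);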