CUDA编程札记

// Number of elements in each input vector.
const int N = 33 * 1024;

// Threads per block. The dot kernel's tree reduction halves the active
// range each step, so this value must be a power of two.
const int threadsPerBlock = 256;

// Blocks per grid: enough blocks to give every element its own thread
// (ceiling division), capped at 32. The minimum is computed directly with
// a conditional expression so that this listing does not depend on the
// imin macro, which is not defined at this point in the file.
const int blocksPerGrid =
    ( (N + threadsPerBlock - 1) / threadsPerBlock < 32 )
        ? (N + threadsPerBlock - 1) / threadsPerBlock
        : 32;

/*
 * dot: computes one partial dot-product sum per block.
 *
 *   a, b : input vectors; elements [0, N) are read
 *   c    : output array; c[blockIdx.x] receives this block's partial sum
 *
 * Each thread accumulates products for the indices it visits, the block
 * then combines its per-thread sums in shared memory, and thread 0 writes
 * the block total. The shared array holds threadsPerBlock floats, and the
 * halving reduction requires blockDim.x to be a power of two.
 */
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];

    const int lane = threadIdx.x;

    // per-thread accumulation: start at the global thread index and
    // advance by the total number of launched threads
    float partial = 0;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
         idx < N;
         idx += blockDim.x * gridDim.x) {
        partial += a[idx] * b[idx];
    }

    // publish this thread's sum and wait for the rest of the block
    cache[lane] = partial;
    __syncthreads();

    // pairwise reduction: the lower half adds in the upper half, then the
    // active range is halved; the barrier is reached by every thread
    for (int half = blockDim.x / 2; half != 0; half /= 2) {
        if (lane < half)
            cache[lane] += cache[lane + half];
        __syncthreads();
    }

    // cache[0] now holds the block total
    if (lane == 0)
        c[blockIdx.x] = cache[0];
}

/*
 * Host driver for the per-block dot product.
 *
 * Fills a[i] = i and b[i] = 2*i, copies both to the device, launches dot,
 * copies back one partial sum per block, adds the partial sums on the CPU
 * and prints the result next to the closed-form value 2 * sum(i^2).
 *
 * Returns 0 on success, 1 if a host allocation fails. HANDLE_ERROR is
 * defined outside this listing (presumably it reports the CUDA error and
 * aborts -- TODO confirm).
 */
int main( void ) {
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // malloc can fail; stop before the buffers are dereferenced
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf( stderr, "host memory allocation failed\n" );
        free( a );
        free( b );
        free( partial_c );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );

    // a kernel launch returns no status of its own; a bad launch
    // configuration is only reported through cudaGetLastError()
    HANDLE_ERROR( cudaGetLastError() );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form of 0^2 + 1^2 + ... + x^2
    #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
             2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}

/*
 * Lock: a spin lock for device code, backed by one int in device memory
 * (0 = free, 1 = held).
 *
 * The constructor allocates and zeroes the int on the host side; the
 * destructor releases it. lock() and unlock() are callable from device
 * code only.
 *
 * NOTE(review): the implicitly generated copy shares the same `mutex`
 * pointer, and every host-side copy that is destroyed calls cudaFree on
 * it. Passing a Lock by value to a kernel therefore deserves a check for
 * a repeated free -- confirm how kernel-argument copies are handled.
 */
struct Lock {
    int *mutex;

    // Allocates the device-side flag and initialises it to 0 (unlocked).
    Lock( void ) {
        HANDLE_ERROR( cudaMalloc( (void**)&mutex,sizeof(int) ) );
        HANDLE_ERROR( cudaMemset( mutex, 0, sizeof(int) ) );
    }

    // Releases the device-side flag; the result of cudaFree is not checked.
    ~Lock( void ) {
        cudaFree( mutex );
    }

    // Spins until the flag is atomically switched from 0 to 1.
    __device__ void lock( void ) {
        while( atomicCAS( mutex, 0, 1 ) != 0 );
        // fence so that memory accesses in the critical section are
        // ordered after the lock has been acquired
        __threadfence();
    }

    // Resets the flag to 0 so another thread can acquire the lock.
    __device__ void unlock( void ) {
        // fence so that writes made in the critical section are visible
        // device-wide before the lock is seen as released
        __threadfence();
        atomicExch( mutex, 0 );
    }
};

// Minimum of two values. Every use of an argument is parenthesised so
// that operator expressions such as imin(x|y, z) group as written.
// Being a macro, it still evaluates its arguments more than once, so do
// not pass expressions with side effects.
#define imin(a,b) ((a)<(b)?(a):(b))

// Number of elements in each input vector.
const int N = 33 * 1024 * 1024;

// Threads per block; the dot kernel's halving reduction requires a
// power of two.
const int threadsPerBlock = 256;

// Blocks per grid: ceiling of N / threadsPerBlock, capped at 32.
const int blocksPerGrid =
            imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

/*
 * dot: accumulates the dot product of a and b into a single float.
 *
 *   lock : spin lock guarding the update of *c
 *   a, b : input vectors; elements [0, N) are read
 *   c    : pointer to the running total; each block adds its sum to *c
 *
 * Each thread accumulates products for the indices it visits, the block
 * combines its per-thread sums in shared memory, and thread 0 adds the
 * block total to *c while holding the lock. The shared array holds
 * threadsPerBlock floats, and the halving reduction requires blockDim.x
 * to be a power of two.
 */
__global__ void dot( Lock lock, float *a,
                     float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];

    const int lane = threadIdx.x;

    // per-thread accumulation: start at the global thread index and
    // advance by the total number of launched threads
    float partial = 0;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
         idx < N;
         idx += blockDim.x * gridDim.x) {
        partial += a[idx] * b[idx];
    }

    // publish this thread's sum and wait for the rest of the block
    cache[lane] = partial;
    __syncthreads();

    // pairwise reduction: the lower half adds in the upper half, then the
    // active range is halved; the barrier is reached by every thread
    for (int half = blockDim.x / 2; half != 0; half /= 2) {
        if (lane < half)
            cache[lane] += cache[lane + half];
        __syncthreads();
    }

    // only one thread per block takes the lock
    if (lane == 0) {
        lock.lock();        // spin until the lock is ours
        *c += cache[0];     // add this block's total to the result
        lock.unlock();
    }
}

/*
 * Host driver for the lock-based dot product.
 *
 * Fills a[i] = i and b[i] = 2*i, copies both vectors and a zeroed result
 * to the device, launches dot with a Lock, copies the single result back
 * and prints it next to the closed-form value 2 * sum(i^2).
 *
 * Returns 0 on success, 1 if a host allocation fails. HANDLE_ERROR is
 * defined outside this listing (presumably it reports the CUDA error and
 * aborts -- TODO confirm).
 */
int main( void ) {
    float   *a, *b, c = 0;
    float   *dev_a, *dev_b, *dev_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );

    // each buffer is N*sizeof(float) bytes; stop if either allocation
    // failed instead of writing through a null pointer
    if (a == NULL || b == NULL) {
        fprintf( stderr, "host memory allocation failed\n" );
        free( a );
        free( b );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c,
                              sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    // the kernel adds into *dev_c, so it must start at zero
    HANDLE_ERROR( cudaMemcpy( dev_c, &c, sizeof(float),
                              cudaMemcpyHostToDevice ) );

    Lock    lock;
    dot<<<blocksPerGrid,threadsPerBlock>>>( lock, dev_a,
                                            dev_b, dev_c );

    // a kernel launch returns no status of its own; a bad launch
    // configuration is only reported through cudaGetLastError()
    HANDLE_ERROR( cudaGetLastError() );

    // copy c back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( &c, dev_c,
                              sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // closed form of 0^2 + 1^2 + ... + x^2
    #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
             2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );

    // free memory on the cpu side
    free( a );
    free( b );

    return 0;
}

/*
 * histo_kernel: byte histogram using atomic adds on global memory.
 *
 *   buffer : input bytes; elements [0, size) are read
 *   size   : number of input bytes
 *   histo  : bin counts indexed by byte value; each visited byte adds 1
 *            to its bin (the kernel only adds, it never clears the bins)
 *
 * The index and stride are kept in `long`, the same type as `size`, so
 * that the loop condition cannot be defeated by the index overflowing an
 * int when size is larger than INT_MAX.
 */
__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {
    // calculate the starting index and the offset to the next
    // block that each thread will be processing
    long i = threadIdx.x + (long)blockIdx.x * blockDim.x;
    long stride = (long)blockDim.x * gridDim.x;

    while (i < size) {
        atomicAdd( &histo[buffer[i]], 1 );
        i += stride;
    }
}

/*
 * Host driver for the global-memory histogram.
 *
 * Obtains SIZE bytes from big_random_block, times the copy to the device,
 * the kernel and the copy back with CUDA events, prints the elapsed time
 * and the sum of all bins, then checks the result by decrementing each
 * bin once per input byte on the CPU and reporting any bin that does not
 * end at zero.
 *
 * SIZE, big_random_block and HANDLE_ERROR are defined outside this
 * listing.
 */
int main( void ) {
    unsigned char *buffer =
                     (unsigned char*)big_random_block( SIZE );

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU.
    cudaEvent_t     start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // allocate memory on the GPU for the file's data
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                              256 * sizeof( int ) ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0,
                              256 * sizeof( int ) ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp  prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks*2,256>>>( dev_buffer, SIZE, dev_histo );

    // a kernel launch returns no status of its own; a bad launch
    // configuration is only reported through cudaGetLastError()
    HANDLE_ERROR( cudaGetLastError() );

    unsigned int    histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                              256 * sizeof( int ),
                              cudaMemcpyDeviceToHost ) );

    // get stop time, and display the timing results
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time to generate:  %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // verify that we have the same counts via CPU
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0)
            // the bins are unsigned; convert explicitly so the argument
            // type matches %d and the difference is shown as signed
            printf( "Failure at %d!  Off by %d\n", i, (int)histo[i] );
    }

    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    HANDLE_ERROR( cudaFree( dev_histo ) );
    HANDLE_ERROR( cudaFree( dev_buffer ) );
    free( buffer );
    return 0;
}

/*
 * histo_kernel: byte histogram using a per-block shared-memory histogram.
 *
 *   buffer : input bytes; elements [0, size) are read
 *   size   : number of input bytes
 *   histo  : global bin counts indexed by byte value; each block adds its
 *            local counts to these bins (the kernel never clears them)
 *
 * Each block counts into a 256-bin array in shared memory using shared
 * atomics, then merges that array into the global histogram with one
 * global atomic add per bin.
 *
 * The clear and merge steps loop over the bins in steps of blockDim.x,
 * so every bin is handled for any one-dimensional block size, not only
 * for blocks of exactly 256 threads. With 256 threads each loop runs
 * once per thread. The data index is kept in `long`, like `size`, so it
 * cannot overflow an int for large inputs.
 */
__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {
    // clear out the accumulation buffer called temp
    __shared__  unsigned int temp[256];
    for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
        temp[bin] = 0;
    __syncthreads();

    // calculate the starting index and the offset to the next
    // block that each thread will be processing
    long i = threadIdx.x + (long)blockIdx.x * blockDim.x;
    long stride = (long)blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd( &temp[buffer[i]], 1 );
        i += stride;
    }

    // wait until every thread of the block has finished counting, then
    // add the block's counts to the global histogram; other blocks are
    // doing the same, hence the global atomic adds
    __syncthreads();
    for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
        atomicAdd( &(histo[bin]), temp[bin] );
}

/*
 * Host driver for the shared-memory histogram.
 *
 * Obtains SIZE bytes from big_random_block, times the copy to the device,
 * the kernel and the copy back with CUDA events, prints the elapsed time
 * and the sum of all bins, then checks the result by decrementing each
 * bin once per input byte on the CPU and reporting any bin that does not
 * end at zero.
 *
 * SIZE, big_random_block and HANDLE_ERROR are defined outside this
 * listing.
 */
int main( void ) {
    unsigned char *buffer =
                     (unsigned char*)big_random_block( SIZE );

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU.  if the data were
    // already on the GPU and we just timed the kernel
    // the timing would drop from 74 ms to 15 ms.  Very fast.
    cudaEvent_t     start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // allocate memory on the GPU for the file's data
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                              256 * sizeof( int ) ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0,
                              256 * sizeof( int ) ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp  prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks*2,256>>>( dev_buffer,
                                    SIZE, dev_histo );

    // a kernel launch returns no status of its own; a bad launch
    // configuration is only reported through cudaGetLastError()
    HANDLE_ERROR( cudaGetLastError() );

    unsigned int    histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                              256 * sizeof( int ),
                              cudaMemcpyDeviceToHost ) );

    // get stop time, and display the timing results
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time to generate:  %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // verify that we have the same counts via CPU
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0)
            printf( "Failure at %d!\n", i );
    }

    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    HANDLE_ERROR( cudaFree( dev_histo ) );
    HANDLE_ERROR( cudaFree( dev_buffer ) );
    free( buffer );
    return 0;
}

注：本文是作者对《GPU高性能编程CUDA实战》一书的学习总结。此书的代码可以在下面的链接下载，无需积分哦！

http://download.csdn.net/detail/celerychen2009/6360573

CUDA编程札记的更多相关文章

不同版本CUDA编程的问题
1 无法装上CUDA的toolkit 卸载所有的NVIDIA相关的app,包括NVIDIA的显卡驱动,然后重装. 2之前的文件打不开,one or more projects in the solut ...
cuda编程基础
转自: http://blog.csdn.net/augusdi/article/details/12529247 CUDA编程模型 CUDA编程模型将CPU作为主机,GPU作为协处理器(co-pro ...
CUDA学习笔记（一）——CUDA编程模型
转自:http://blog.sina.com.cn/s/blog_48b9e1f90100fm56.html CUDA的代码分成两部分,一部分在host(CPU)上运行,是普通的C代码:另一部分在d ...
CUDA编程
目录: 1.什么是CUDA 2.为什么要用到CUDA 3.CUDA环境搭建 4.第一个CUDA程序 5. CUDA编程 5.1. 基本概念 5.2. 线程层次结构 5.3. 存储器层次结构 5.4. ...
CUDA编程－（1）Tesla服务器Kepler架构和万年的HelloWorld
结合CUDA范例精解以及CUDA并行编程.由于正在学习CUDA,CUDA用的比较多,因此翻译一些个人认为重点的章节和句子,作为学习,程序将通过NVIDIA K40服务器得出结果.如果想通过本书进行CU ...
cuda编程（一）
环境安装和例程运行显卡主要有两家,ATI.NVIDIA,简称A卡和N卡.随着GPU计算能力的上升,采用GPU并行计算来加速的应用越来越多. Nvidia创立人之一,黄仁勋(Jen-Hsun Huan ...
CUDA编程入门，Dim3变量
dim3是NVIDIA的CUDA编程中一种自定义的整型向量类型,基于用于指定维度的uint3. 例如:dim3 grid(num1,num2,num3): dim3类型最终设置的是一个三维向量,三维参 ...
CUDA编程（六）进一步并行
CUDA编程(六) 进一步并行在之前我们使用Thread完毕了简单的并行加速,尽管我们的程序运行速度有了50甚至上百倍的提升,可是依据内存带宽来评估的话我们的程序还远远不够.在上一篇博客中给大家介绍 ...
CUDA编程模型之内存管理
CUDA编程模型假设系统是由一个主机和一个设备组成的,而且各自拥有独立的内存. 主机:CPU及其内存(主机内存),主机内存中的变量名以h_为前缀,主机代码按照ANSI C标准进行编写设备:GPU及其 ...

随机推荐

Maven的私有仓库Nexus
1.什么是Nexus 在前面进行maven项目的构建中,可以看到在构建的过程中需要安装maven的依赖插件,如图: 在日常的开发构建中,我们也可以自己搭建一个私有的nexus.那么什么是nexus呢? ...
使用Gradle管理第三方依赖
http://blog.bsdn.org/2012/01/02/%E4%BD%BF%E7%94%A8gradle%E7%AE%A1%E7%90%86%E7%AC%AC%E4%B8%89%E6%96%B ...
一个Apache安装多个版本的PHP
我的服务器centos6.5安装了xampp,php6.5版本的.已经有好几个网站在上面运行了,但是后面要安装该死的ecshop,无奈要装php5.2,因此就想如何能在一个apache上安装多个版本的 ...
Spring Security静态资源访问
在使用Spring Security时要求所有请求都需要授权访问,此时会定义过滤规则如下 protected void configure(HttpSecurity http) throws Exce ...
TensorFlow-GPU安装配置（win10+tensorflow1.6+CUDA9.0+cudnn7.0+python3.6+Visual Studio2013）
安装步骤: TensorFlow官网 tensorflow一般只能装在python3上,CUDA9.0搭配cudnn7.0,CUDA8.0搭配cudnn6.0 查看对应要安装的环境版本(因为会不断更新 ...
Xamarin中Unsupported major.minor version 52.0问题解决
Xamarin中Unsupported major.minor version 52.0问题解决出现这种问题,是由于所使用的Java代码使用Java 8所才具有的特性.这个时候,需要将JDK升级到J ...
JZYZOJ 1360 [usaco2011feb]人品问题 DP 树状数组离散化
http://172.20.6.3/Problem_Show.asp?id=1360 好想好写代码 #include<iostream> #include<cstdio&g ...
light oj 1151 - Snakes and Ladders 高斯消元+概率DP
思路: 在没有梯子与蛇的时候很容易想到如下公式: dp[i]=1+(∑dp[i+j])/6 但是现在有梯子和蛇也是一样的,初始化p[i]=i; 当有梯子或蛇时转移为p[a]=b; 这样方程变为: dp ...
Problem D: 零起点学算法83——数组中删数
#include<stdio.h> int main(void) { int n,i,t,x,flag; while(scanf("%d",&n)!=EOF) ...
[bzoj1011](HNOI2008)遥远的行星(近似运算)
Description 直线上N颗行星,X=i处有行星i,行星J受到行星I的作用力,当且仅当i<=AJ.此时J受到作用力的大小为 Fi->j=Mi*Mj/(j-i) 其中A为很小的常量, ...

CUDA编程札记

CUDA编程札记的更多相关文章

随机推荐

热门专题