Getting Started with CUDA (1)
- GPUs can handle thousands of concurrent threads.
- The pieces of code running on the GPU are called kernels.
- A kernel is executed by a set of threads.
- All threads execute the same code (SPMD).
- Each thread has an index that is used to calculate the memory addresses it will access.
- Threads are grouped into blocks.
- Blocks are grouped into a grid.
- A kernel is executed as a grid of blocks of threads.
- Built-in variables: threadIdx, blockIdx, blockDim, gridDim.
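Putting these together: a kernel launched with several blocks of several threads gives each thread a unique global index computed from blockIdx, blockDim and threadIdx. A minimal sketch (the kernel name addKernelGrid, the bounds parameter n, and the block size of 256 are illustrative choices, not part of the program below):

__global__ void addKernelGrid(int *c, const int *a, const int *b, int n)
{
// global index = block number * block width + position within the block
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) // guard the last, partially filled block
c[i] = a[i] + b[i];
}
// launch with enough 256-thread blocks to cover n elements:
// addKernelGrid<<<(n + 255) / 256, 256>>>(dev_c, dev_a, dev_b, n);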
CUDA organizes threads in a Grid-Block-Thread hierarchy: a group of threads running in parallel forms a block, and a group of blocks running in parallel forms a grid. The program below demonstrates thread parallelism and block parallelism in turn; thread parallelism is fine-grained, while block parallelism is coarse-grained. For example, the thread-parallel kernel is launched as addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#define MAX 255
#define MIN 0
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float* etime);
__global__ void addKernelThread(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
__global__ void addKernelBlock(int *c, const int *a, const int *b)
{
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
int a[arraySize] = { 0 };
int b[arraySize] = { 0 };
for (int i = 0; i < arraySize; i++){
a[i] = rand() % (MAX + 1 - MIN) + MIN;
b[i] = rand() % (MAX + 1 - MIN) + MIN;
}
int c[arraySize] = { 0 };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = 0; float time;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = 0; i < num; i++)
{
cudaGetDeviceProperties(&prop,i);
}
// Run the thread-parallel version (type 0), then the block-parallel version (type 1).
cudaStatus = addWithCuda(c, a, b, arraySize, 0, &time);
printf("Elapsed time of thread is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",a[0],a[1],a[2],a[3],a[4],b[0],b[1],b[2],b[3],b[4],c[0],c[1],c[2],c[3],c[4]);
cudaStatus = addWithCuda(c, a, b, arraySize, 1, &time);
printf("Elapsed time of block is : %f \n", time);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",a[],a[],a[],a[],a[],b[],b[],b[],b[],b[],c[],c[],c[],c[],c[]);
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadExit failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float * etime)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
clock_t start, stop;
float time;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
if(type == 0){
start = clock();
addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
}
else{
start = clock();
addKernelBlock<<<size, 1>>>(dev_c, dev_a, dev_b);
}
stop = clock();
time = (float)(stop-start)/CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
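One thing the helper above does not do is check whether the kernel launch itself succeeded: it only checks the return value of cudaThreadSynchronize. Usual practice is also to call cudaGetLastError() immediately after the <<<...>>> launch to catch launch-time errors such as an invalid configuration. A sketch of that check, which is not in the original listing, placed right after the launch:

cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}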
The output is:
Elapsed time of thread is : 0.000010
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
Elapsed time of block is : 0.000005
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
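A caveat about these numbers: kernel launches are asynchronous, and the code records stop = clock() before cudaThreadSynchronize(), so what is printed is essentially the host-side launch overhead rather than the time the kernels spend running on the GPU. To time the kernel itself, CUDA events are the usual tool; a minimal sketch of how the thread-parallel launch inside addWithCuda could be timed (the event variable names are illustrative):

cudaEvent_t evStart, evStop;
float ms = 0.0f;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);
cudaEventRecord(evStart, 0); // record on the default stream
addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop); // block until the kernel and the stop event complete
cudaEventElapsedTime(&ms, evStart, evStop); // elapsed time in milliseconds
cudaEventDestroy(evStart);
cudaEventDestroy(evStop);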