1. GPUs can handle thousands of concurrent threads.

2. The pieces of code running on the GPU are called kernels.

3. A kernel is executed by a set of threads.

4. All threads execute the same code (SPMD).

5. Each thread has an index that it uses to calculate the memory addresses it will access.

1. Threads are grouped into blocks.

2. Blocks are grouped into a grid.

3. A kernel is executed as a grid of blocks of threads.

Built-in variables: threadIdx, blockIdx, blockDim, gridDim.

CUDA的线程组织即Grid-Block-Thread结构。一组线程并行处理可以组织为一个block,而一组block并行处理可以组织为一个Grid。下面的程序分别为线程并行和块并行,线程并行为细粒度的并行,而块并行为粗粒度的并行。例如,线程并行的核函数调用形式为:`addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);`

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX 255
#define MIN 0
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float* etime);
// Thread-parallel element-wise addition: c[k] = a[k] + b[k], where k is the
// calling thread's x-index within its block.
//
// Only threadIdx.x is used to pick the element, so every block of a launch
// works on the same index range; the intended launch is a single block with
// one thread per element. There is no bounds check: the caller must launch
// no more threads per block than the arrays have elements.
//
// c: device pointer to the output array.
// a, b: device pointers to the input arrays.
__global__ void addKernelThread(int *c, const int *a, const int *b)
{
    const int elem = threadIdx.x;
    const int sum = a[elem] + b[elem];
    c[elem] = sum;
}
// Block-parallel element-wise addition: c[k] = a[k] + b[k], where k is the
// calling block's x-index within the grid.
//
// Only blockIdx.x is used to pick the element, so every thread of a block
// computes the same element; callers would normally launch one block per
// element. There is no bounds check: the caller must launch no more blocks
// than the arrays have elements.
//
// c: device pointer to the output array.
// a, b: device pointers to the input arrays.
__global__ void addKernelBlock(int *c, const int *a, const int *b)
{
    const int elem = blockIdx.x;
    const int sum = a[elem] + b[elem];
    c[elem] = sum;
}
// Entry point. Fills two 5-element integer vectors with pseudo-random values
// in [MIN, MAX], adds them on the GPU twice via addWithCuda (first with
// type 0, reported as the "thread" run, then with type 1, reported as the
// "block" run), and prints the elapsed time and the result of each run.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int arraySize = 5;
    int a[arraySize] = { 1, 2, 3, 4, 5 };
    int b[arraySize] = { 10, 20, 30, 40, 50 };

    // Overwrite the initial values with pseudo-random numbers in [MIN, MAX].
    // rand() is never seeded, so every run produces the same sequence.
    for (int i = 0; i < arraySize; i++) {
        a[i] = rand() % (MAX + 1 - MIN) + MIN;
        b[i] = rand() % (MAX + 1 - MIN) + MIN;
    }
    int c[arraySize] = { 0 };

    cudaError_t cudaStatus;
    int num = 0;
    float elapsed = 0.0f;  // initialized so a failed run never prints garbage
    cudaDeviceProp prop;

    // Enumerate the devices; the queried properties are not used further.
    cudaStatus = cudaGetDeviceCount(&num);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed!");
        return 1;
    }
    for (int i = 0; i < num; i++)
    {
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties failed!");
            return 1;
        }
    }

    // Add vectors in parallel: first run (type 0).
    cudaStatus = addWithCuda(c, a, b, arraySize, 0, &elapsed);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("Elasped time of thread is : %f \n", elapsed);
    printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
           a[0], a[1], a[2], a[3], a[4],
           b[0], b[1], b[2], b[3], b[4],
           c[0], c[1], c[2], c[3], c[4]);

    // Second run (type 1).
    cudaStatus = addWithCuda(c, a, b, arraySize, 1, &elapsed);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("Elasped time of block is : %f \n", elapsed);
    printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
           a[0], a[1], a[2], a[3], a[4],
           b[0], b[1], b[2], b[3], b[4],
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // (It replaces the deprecated cudaThreadExit.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
//
// Copies a and b to the device, launches one of the two addition kernels,
// waits for it to finish, and copies the result back into c.
//
// c:     host output array of `size` ints.
// a, b:  host input arrays of `size` ints.
// size:  number of elements. It is used directly as the thread count
//        (type 0) or the block count (type != 0), so it must fit the
//        device's launch limits; an over-large value is reported as a
//        launch error.
// type:  0 launches addKernelThread with one block of `size` threads;
//        any other value launches addKernelBlock with `size` blocks of
//        one thread.
// etime: if non-NULL, receives the host-measured wall time in seconds from
//        just before the launch until the kernel has completed.
//
// Returns cudaSuccess or the first CUDA error encountered. Device buffers
// are released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, int type, float *etime)
{
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    clock_t start, stop;
    cudaError_t cudaStatus;

    if (etime != NULL)
    {
        *etime = 0.0f;  // defined value even if we bail out before timing
    }
    if (size == 0)
    {
        // Nothing to add; a launch with zero threads/blocks would be invalid.
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    start = clock();
    if (type == 0)
    {
        addKernelThread<<<1, (unsigned int)size>>>(dev_c, dev_a, dev_b);
    }
    else
    {
        addKernelBlock<<<(unsigned int)size, 1>>>(dev_c, dev_a, dev_b);
    }

    // A launch does not return a status; configuration errors (for example
    // too many threads per block) are only visible through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution. The clock is stopped only
    // after this call: launches are asynchronous, so stopping it earlier
    // would measure launch overhead rather than kernel execution.
    cudaStatus = cudaDeviceSynchronize();
    stop = clock();
    if (etime != NULL)
    {
        *etime = (float)(stop - start) / CLOCKS_PER_SEC;
    }
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts null pointers, so this is safe on every path.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}

运行的结果是

Elasped time of thread is : 0.000010
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
Elasped time of block is : 0.000005
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}

CUDA入门1的更多相关文章

  1. CUDA入门

    CUDA入门 鉴于自己的毕设需要使用GPU CUDA这项技术,想找一本入门的教材,选择了Jason Sanders等所著的书<CUDA By Example an Introduction to ...

  2. 一篇不错的CUDA入门

    鉴于自己的毕设需要使用GPU CUDA这项技术,想找一本入门的教材,选择了Jason Sanders等所著的书<CUDA By Example an Introduction to Genera ...

  3. CUDA入门需要知道的东西

    CUDA刚学习不久,做毕业要用,也没时间研究太多的东西,我的博客里有一些我自己看过的东西,不敢保证都特别有用,但是至少对刚入门的朋友或多或少希望对大家有一点帮助吧,若果你是大牛请指针不对的地方,如果你 ...

  4. Cuda入门笔记

    最近在学cuda ,找了好久入门的教程,感觉入门这个教程比较好,网上买的书基本都是在掌握基础后才能看懂,所以在这里记录一下.百度文库下载,所以不知道原作者是谁,向其致敬! 文章目录 1. CUDA是什 ...

  5. CUDA 入门(转)

    CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...

  6. CUDA编程->CUDA入门了解(一)

    安装好CUDA6.5+VS2012,操作系统为Win8.1版本号,首先下个GPU-Z检測了一下: 看出本显卡属于中低端配置.关键看两个: Shaders=384.也称作SM.或者说core/流处理器数 ...

  7. CUDA中Bank conflict冲突

    转自:http://blog.csdn.net/smsmn/article/details/6336060 其实这两天一直不知道什么叫bank conflict冲突,这两天因为要看那个矩阵转置优化的问 ...

  8. 【CUDA】CUDA框架介绍

    引用 出自Bookc的博客,链接在此http://bookc.github.io/2014/05/08/my-summery-the-book-cuda-by-example-an-introduct ...

  9. 转:ubuntu 下GPU版的 tensorflow / keras的环境搭建

    http://blog.csdn.net/jerr__y/article/details/53695567 前言:本文主要介绍如何在 ubuntu 系统中配置 GPU 版本的 tensorflow 环 ...

随机推荐

  1. 自制奇葩vb面试题,看你能对几道

    这些题都比较奇葩,所以做出选择之前请仔细考虑. 答题过程中不要离开当前页面,不要去试代码,也不要查参考或问别人. 转载请说明作者是 Nukepayload2 Vb版本:14 默认的.net frame ...

  2. MVC中视图View向控制器传值的方法

    MVC中视图View向控制器传值的方法步骤如下: 1.index页面: 页面中只需要一个触发事件的按钮

  3. Vs2012出现停止工作问题的解决方法

    我的VS2012总是出现问题,打开项目会,更改移动控件位置也会,后来在网上找到了解决方法 这是出现问题

  4. sql 执行时间

    SET STATISTICS PROFILE ON SET STATISTICS IO ON SET STATISTICS TIME ON --GO /*--你的SQL脚本开始*/SELECT * F ...

  5. Oracle 中 call 和 exec的区别

    今天发现了一个小东西,觉得很有意思,查找了一些资料,跟大家分享一下: 在sqlplus中: 在第三方提供的工具(如:plsqldev) 总结: exec是sqlplus的命令,只能在sqlplus中使 ...

  6. LinQ实战学习笔记(四) LINQ to Object, 常用查询操作符

    这一篇介绍了下面的内容: 查询object数组 查询强类型数组 查询泛型字典 查询字符串 SelectMany 索引 Distinct操作符 排序 嵌套查询 分组 组连接 内连接 左外连接 交叉连接 ...

  7. Linux编辑器vim键盘详解

    下面的这张图,一看就明白了,从此,学习变的不再艰难! 补注:图中没有关于查找和替换的,应该用下面的.自上而下的查找操作                  /word小写的n和N自下而上的查找操作    ...

  8. js的alert和confirm美化

    --前言-- window对象的alert和confirm标准方法在不同浏览器的显示效果不太相同,有个相同点是都不是很美观.我们的想法是使用js和css分别仿照它们,提供另一套函数,使在不同浏览器的有 ...

  9. iScroll-js—“smooth scrolling for the web”

    原文地址: http://bigdots.github.io/2015/12/15/iScroll-js%E2%80%94%E2%80%94smooth%20scrolling%20for%20the ...

  10. HTML 5 中的标准属性

    HTML 全局属性 HTML 属性赋予元素意义和语境. 下面的全局属性可用于任何 HTML 元素. (5)= HTML5 中添加的属性. 属性 描述 accesskey 规定激活元素的快捷键. cla ...