cuda中thread id

 ////////////////////////////////////////////////////////////////////////////

 //

 // Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.

 //

 // Please refer to the NVIDIA end user license agreement (EULA) associated

 // with this source code for terms and conditions that govern your use of

 // this software. Any use, reproduction, disclosure, or distribution of

 // this software and related documentation outside the terms of the EULA

 // is strictly prohibited.

 //

 ////////////////////////////////////////////////////////////////////////////

 //

 // This sample illustrates the usage of CUDA events for both GPU timing and

 // overlapping CPU and GPU execution.  Events are inserted into a stream

 // of CUDA calls.  Since CUDA stream calls are asynchronous, the CPU can

 // perform computations while GPU is executing (including DMA memcopies

 // between the host and device).  CPU can query CUDA events to determine

 // whether GPU has completed tasks.

 //

 // includes, system

 #include <stdio.h>

 // includes CUDA Runtime

 #include <cuda_runtime.h>

 // includes, project

 #include <helper_cuda.h>

 #include <helper_functions.h> // helper utility functions 

 __global__ void increment_kernel(int *g_data, int inc_value)

 {

     int idx = blockIdx.x * blockDim.x + threadIdx.x;// thread id 计算分三级:thread, block .grid .

     g_data[idx] = g_data[idx] + inc_value; //每一个线程,把对应的操作数增加一个常数

 }

 bool correct_output(int *data, const int n, const int x)

 {

     for (int i = ; i < n; i++)

         if (data[i] != x)

         {

             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);

             return false;

         }

     return true;

 }

 int main(int argc, char *argv[])

 {

     int devID;

     cudaDeviceProp deviceProps;

     printf("[%s] - Starting...\n", argv[]);

     // This will pick the best possible CUDA capable device

     devID = findCudaDevice(argc, (const char **)argv);

     // get device name

     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));

     printf("CUDA device [%s]\n", deviceProps.name);

     int n =  *  * ;

     int nbytes = n * sizeof(int);

     int value = ;

     // allocate host memory

     int *a = ;

     checkCudaErrors(cudaMallocHost((void **)&a, nbytes));

     memset(a, , nbytes);

     // allocate device memory

     int *d_a=;

     checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));

     checkCudaErrors(cudaMemset(d_a, , nbytes));

     // set kernel launch configuration

     dim3 threads = dim3(, );//每个block1024个threads，一维

     dim3 blocks  = dim3(n / threads.x, );//block数量，

     // create cuda event handles

     cudaEvent_t start, stop;//运算计时

     checkCudaErrors(cudaEventCreate(&start));

     checkCudaErrors(cudaEventCreate(&stop));

     StopWatchInterface *timer = NULL;

     sdkCreateTimer(&timer);

     sdkResetTimer(&timer);

     checkCudaErrors(cudaDeviceSynchronize());

     float gpu_time = 0.0f;

     printf("a=%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t\n",a[n--],a[n--],a[n--],a[n--],a[n--],a[n--],a[n--],a[n--],a[n--]);

     // asynchronously issue work to the GPU (all to stream 0)

     sdkStartTimer(&timer);

     cudaEventRecord(start, );

     cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, );//把host中变量a复制到device中的变量d_a

     increment_kernel<<<blocks, threads, , >>>(d_a, value);//device执行

     cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, );//device结果复制到host

     cudaEventRecord(stop, );

     sdkStopTimer(&timer);

     // have CPU do some work while waiting for stage 1 to finish

     unsigned long int counter=;

     while (cudaEventQuery(stop) == cudaErrorNotReady)

     {

         counter++;

     }

     checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

     // print the cpu and gpu times

     printf("time spent executing by the GPU: %.2f\n", gpu_time);

     printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));

     printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

     printf("a=%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t\n",a[n--],a[n--],a[n--],a[n--],a[n--],a[n--],a[n--],a[],a[]);

     // check the output for correctness

     bool bFinalResults = correct_output(a, n, value);

     // release resources

     checkCudaErrors(cudaEventDestroy(start));

     checkCudaErrors(cudaEventDestroy(stop));

     checkCudaErrors(cudaFreeHost(a));

     checkCudaErrors(cudaFree(d_a));

     exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);

 }

一个grid包含多个blocks，这些blocks的组织方式可以是一维，二维或者三维。任何一个block包含有多个Threads，这些Threads的组织方式也可以是一维，二维或者三维。举例来讲：比如上图中，任何一个block中有10个Thread，那么，Block(0,0)的第一个Thread的ThreadIdx是0，Block(1,0)的第一个Thread的ThreadIdx是11；Block(2,0)的第一个Thread的ThreadIdx是21，......，依此类推，

cuda中thread id的更多相关文章

CUDA中并行规约（Parallel Reduction）的优化
转自: http://hackecho.com/2013/04/cuda-parallel-reduction/ Parallel Reduction是NVIDIA-CUDA自带的例子,也几乎是所有C ...
CUDA中确定你显卡的thread和block数
CUDA中确定你显卡的thread和block数在进行并行计算时, 你的显卡所支持创建的thread数与block数是有限制的, 因此, 需要自己提前确定够用, 再进行计算, 否则, 你需要改进你的 ...
C#中 Thread，Task，Async/Await，IAsyncResult 的那些事儿！
说起异步,Thread,Task,async/await,IAsyncResult 这些东西肯定是绕不开的,今天就来依次聊聊他们 1.线程(Thread) 多线程的意义在于一个应用程序中,有多个执行部 ...
C#中 Thread，Task，Async/Await，IAsyncResult 的那些事儿！[转载]
说起异步,Thread,Task,async/await,IAsyncResult 这些东西肯定是绕不开的,今天就来依次聊聊他们 1.线程(Thread) 多线程的意义在于一个应用程序中,有多个执行部 ...
thread::id
线程标识符id可以通过thread::get_id()获得,若thread obejct没有和任何线程关联则返回一个NULL的std::thread::id表示没有任何线程.当前线程若想获得自己的id ...
CUDA中使用多维数组
今天想起一个问题,看到的绝大多数CUDA代码都是使用的一维数组,是否可以在CUDA中使用一维数组,这是一个问题,想了各种问题,各种被77的错误状态码和段错误折磨,最后发现有一个cudaMallocMa ...
Android Framework中Thread类
Thread类是Android为线程操作而做的一个封装.代码在Thread.cpp中,其中还封装了一些与线程同步相关的类. Thread类 Thread类的构造函数中的有一个canCallJava T ...
详解C#中 Thread，Task，Async/Await，IAsyncResult的那些事儿
说起异步,Thread,Task,async/await,IAsyncResult 这些东西肯定是绕不开的,今天就来依次聊聊他们 1.线程(Thread) 多线程的意义在于一个应用程序中,有多个执行部 ...
删除数据表中除id外其他字段相同的冗余信息
删除一个信息表中除id外其他字段都相同的冗余信息,如下 id name addr 1 a b 2 a b 3 b c 删除这个表中的冗余信息即应该是 id name addr 1 a b 3 b c ...

随机推荐

jQuery8种不同的瀑布流懒加载loading效果
优化图片加载插件jQuery8种不同的瀑布流懒加载loading效果在线预览下载地址实例代码 <ul class="grid effect-1" id="g ...
（转）高性能JavaScript：加载和运行（动态加载JS代码）
浏览器是如何加载JS的当浏览器遇到一个<script>标签时,浏览器首先根据标签src属性下载JavaScript代码,然后运行JavaScript代码,继而继续解析和翻译页面.如果需要 ...
ArcEngine中合并断开的线要素（根据几何判断）
在上一篇ArcEngine环境下合并断开的线要素(根据属性)随笔中介绍了如何通过shp文件属性表中相同的属性字段进行线要素的合并.今天刚把通过几何条件判断的方式连接断开的线要素的ArcGIS 插件完成 ...
[Android]解决ClickableSpan中点击后ListView中item的长按冲突的问题
以下内容为原创,转载请注明: 来自天天博客:http://www.cnblogs.com/tiantianbyconan/p/3823429.html 项目中碰到一个问题,情景是这样的: 有一个Lis ...
JavaScript学习13 JavaScript中的继承
JavaScript学习13 JavaScript中的继承继承第一种方式:对象冒充 <script type="text/javascript"> //继承第一种方式 ...
JavaBean 基础概念、使用实例及代码分析
JavaBean 基础概念.使用实例及代码分析 JavaBean的概念 JavaBean是一种可重复使用的.且跨平台的软件组件. JavaBean可分为两种:一种是有用户界面的(有UI的):另一种是没 ...
html 常用的标签
1.html 的基本格式 <html> <head> <meta charset="UTF-8"> <title>HTML5的标题& ...
Android studio 如何查看当前git 分支的4种方式
1.第一种 2.第二种 3.第三种 4.第四种前面3种都是通过android studio 操作的. 第四种是通过命令行操作.(可以在 git bash 中输入命 ...
【读书笔记】iOS网络-解析响应负载
Web Service可以通过多种格式返回结构化数据, 不过大多数时候使用的是XML与JSON.也可以让应用只接收HTML结构的数据.实现了这些Web Service或是接收HTML文档的应用必须能解 ...
Unity3D 面试题汇总
最先执行的方法是: 1.(激活时的初始化代码)Awake,2.Start.3.Update[FixUpdate.LateUpdate].4.(渲染模块)OnGUI.5.再向后,就是卸载模块(TearD ...

cuda中thread id

cuda中thread id的更多相关文章

随机推荐

热门专题