0_Simple__asyncAPI

▶ CPU - GPU 异步操作

▶ 源代码

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 #include <helper_functions.h>

 // Adds inc_value to one element of g_data per thread.
 //
 // Each thread derives a flat 1D global index from blockIdx.x, blockDim.x and
 // threadIdx.x and updates exactly that element. There is no bounds check, so
 // the launch must supply exactly one thread per array element (the grid size
 // times the block size must equal the number of elements in g_data).
 __global__ void increment_kernel(int *g_data, int inc_value)
 {
     const int gid = threadIdx.x + blockDim.x * blockIdx.x;
     g_data[gid] += inc_value;
 }

 // Host-side verification: checks that every one of the first n entries of
 // data equals the reference value x.
 //
 //   data  array to inspect (host-accessible memory)
 //   n     number of elements to check; n <= 0 checks nothing and returns true
 //   x     expected value of every element
 //
 // Returns true when all n elements match. On the first mismatch it prints the
 // offending index, the value found and the reference value, then returns false.
 bool correct_output(int *data, const int n, const int x)
 {
     for (int i = 0; i < n; i++)   // start at element 0 (initializer was missing)
     {
         if (data[i] != x)
         {
             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
             return false;
         }
     }
     return true;
 }

 // Demonstrates asynchronous CPU/GPU execution: the host queues a host-to-device
 // copy, a kernel launch and a device-to-host copy on stream 0, then spins on
 // cudaEventQuery() and counts how many loop iterations it completes before the
 // GPU reaches the `stop` event.
 //
 // The numeric constants below follow NVIDIA's original asyncAPI sample.
 // Returns 0 when the result verifies, 1 otherwise.
 int main(int argc, char *argv[])
 {
     printf("Start.\n");

     // Pick the device from the command-line arguments (they may be empty).
     int devID = findCudaDevice(argc, (const char **)argv);
     cudaDeviceProp deviceProps;
     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
     printf("CUDA device [%s]\n", deviceProps.name);

     const int n = 16 * 1024 * 1024;                 // number of ints processed
     const size_t nbytes = (size_t)n * sizeof(int);
     const int value = 26;                           // increment applied by the kernel

     int *a = NULL, *d_a = NULL;
     // Pinned host memory, which is what asynchronous copies want;
     // it must be released with cudaFreeHost.
     checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
     checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
     memset(a, 0, nbytes);                           // host data starts at 0, so the expected result is `value`
     // The fill byte is irrelevant to the result: the host-to-device copy below
     // overwrites the whole buffer before the kernel reads it.
     checkCudaErrors(cudaMemset(d_a, 255, nbytes));

     cudaEvent_t start, stop;                        // GPU-side timer
     checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));

     StopWatchInterface *timer = NULL;               // CPU-side timer
     sdkCreateTimer(&timer);
     sdkResetTimer(&timer);

     // increment_kernel has no bounds check, so the grid must cover exactly n
     // elements: n has to be a multiple of threads.x (16M / 512 divides evenly).
     dim3 threads = dim3(512, 1, 1);
     dim3 blocks = dim3(n / threads.x, 1, 1);

     // The GPU events are recorded inside the CPU-timed region, but every GPU
     // call here is asynchronous: the CPU timer only measures how long it takes
     // to enqueue the work, not how long the GPU takes to execute it.
     sdkStartTimer(&timer);
     checkCudaErrors(cudaEventRecord(start, 0));
     checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
     increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
     checkCudaErrors(cudaGetLastError());            // a launch does not return errors itself
     checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
     checkCudaErrors(cudaEventRecord(stop, 0));
     sdkStopTimer(&timer);

     // Count how many loop iterations the CPU completes before the GPU is done.
     unsigned long int counter = 0;
     cudaError_t status;
     while ((status = cudaEventQuery(stop)) == cudaErrorNotReady)
         counter++;
     // The loop also exits on a real error; report it instead of reading bad data.
     checkCudaErrors(status);

     // The GPU work is guaranteed to be finished here, so the elapsed time is valid.
     float gpu_time = 0.0f;
     checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

     printf("time spent by GPU: %.2f\n", gpu_time);
     printf("time spent by CPU: %.2f\n", sdkGetTimerValue(&timer));
     printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

     const bool passed = correct_output(a, n, value);
     printf("\n\tFinish: %s.", passed ? "Pass" : "Fail");

     // Release everything that was created above, including the CPU timer.
     sdkDeleteTimer(&timer);
     checkCudaErrors(cudaEventDestroy(start));
     checkCudaErrors(cudaEventDestroy(stop));
     checkCudaErrors(cudaFreeHost(a));
     checkCudaErrors(cudaFree(d_a));

     getchar();
     return passed ? 0 : 1;
 }

● 输出结果：

GPU Device : "GeForce GTX 1070" with compute capability 6.1

CUDA device [GeForce GTX 1070]

time spent by GPU: 11.50

time spent by CPU: 0.05

CPU executed  iterations while waiting for GPU to finish

        Finish!

▶ 新姿势：

● 调用主函数时的第0个参数作为程序名字符串，可以用于输出。

 int main(int argc, char *argv[])

 ...

 printf("%s", argv[0]);

● 在没有附加 flag 的情况下申请主机内存，注意使用cudaFreeHost释放

 int *a, nbytes = n * sizeof(int);

 cudaMallocHost((void **)&a, nbytes);

 ...

 cudaFreeHost(a);

● 记录 CPU 调用 CUDA 所用的时间

 StopWatchInterface *timer = NULL;

 sdkCreateTimer(&timer);

 sdkResetTimer(&timer);

 sdkStartTimer(&timer);

 ...// 核函数调用

 sdkStopTimer(&timer);

 printf("%.2f ms", sdkGetTimerValue(&timer));

● 查看GPU队列状态的函数

extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);

■ stop为放置到流中的一个事件，cudaEventQuery(stop)返回该事件的状态：等于cudaSuccess（值等于0）表示已经发生；等于cudaErrorNotReady表示尚未发生（该枚举的具体数值随 CUDA 版本而变，旧版本中为34，CUDA 10.1 起为600，因此应当与枚举名比较，而不要依赖具体数值）。源代码中利用这段时间让CPU空转，记录了迭代次数。

while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;

● stdlib.h 中关于返回成功和失败的宏

 #define EXIT_SUCCESS 0

 #define EXIT_FAILURE  1

● 示例文件中的错误检查函数（定义在helper_cuda.h中），报告出错文件、行号、函数名，并且重启cudaDevice。

 #define checkCudaErrors(val)  check((val), #val, __FILE__, __LINE__)

 // Reports a failed CUDA call and terminates the program.
 //
 //   result  status value returned by the call; any non-zero value is an error
 //   func    text of the checked expression (checkCudaErrors passes #val)
 //   file    source file of the call site (__FILE__)
 //   line    source line of the call site (__LINE__)
 //
 // A zero result returns immediately. Otherwise the location, the numeric code,
 // its name from _cudaGetErrorEnum and the expression text go to stderr, the
 // DEVICE_RESET macro is expanded, and the process exits with EXIT_FAILURE.
 template <typename T>
 void check(T result, char const *const func, const char *const file, int const line)
 {
     if (!result)
         return;  // success: nothing to report

     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
             file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
     DEVICE_RESET  // Make sure we call CUDA Device Reset before exiting
     exit(EXIT_FAILURE);
 }

 #define DEVICE_RESET  cudaDeviceReset();

0_Simple__asyncAPI的更多相关文章

随机推荐

win32多线程编程
关于多线程多进程的学习,有没有好的书籍我接触的书里头关于多线程多进程部分,一是<操作系统原理>里面讲的相关概念一个是<linux基础教程>里面讲的很简单的多线程多进程编程 ...
APUE 4 - 线程
对传统的UNIX进程来讲,一个进程中只有一个线程,这就意味着一个进程在同一时刻只能做一件事(即使是多核CPU).使用多线程技术, 我们可以设计程序使得一个进程在同一时刻做多件事.使用多线程编程具有以下 ...
记忆搜索与动态规划——DP背包问题
题目描述 01背包问题有n个重量和价值分别为\(w_i,v_i\)的物品.从这些物品中挑选出总重量不超过W的物品,求所有挑选方案中价值中总和的最大值. 限制条件 1 <= n <= 10 ...
Huge Mission
Huge Mission Problem Description Oaiei is busy working with his graduation design recently. If he ca ...
在ASP.NET Core中如何支持每个租户数据存储策略的数据库
在ASP.NET Core中如何支持每个租户数据存储策略的数据库不定时更新翻译系列,此系列更新毫无时间规律,文笔菜翻译菜求各位看官老爷们轻喷,如觉得我翻译有问题请挪步原博客地址本博文翻译自: ht ...
ERROR! MySQL server PID file could not be found!的解决方法
启动MySQL服务 [root@test vhosts]# /etc/init.d/mysqld restart 提示错误: ERROR! MySQL server PID file could no ...
Windows下Apache添加SSL模块
参考资料:http://www.yuansir-web.com/2011/05/12/hello-world/测试环境:windows2003 32位 + Apache2.4 + PHP5.4 一.准 ...
MySql中 where IN 字符串
正常where IN 字符串的时候会有问题但是我们经常会有一个字段中存了好几个甚至一堆的值 ,例如字段IDs(字符串类型)里面存了1,2,3,4 此时 FIND_IN_SET 就能解决我们这个棘 ...
Opencv处理鼠标事件-OpenCV步步精深
在图片上双击过的位置绘制一个圆圈鼠标事件就是和鼠标有关的,比如左键按下,左键松开,右键按下,右键松开,双击右键等等. 我们可以通过鼠标事件获得与鼠标对应的图片上的坐标.我们通过以下函数来调用查看所 ...
Django中添加富文本编辑器
使用的是CKeditor这个模块 1.安装: pip install django-ckeditor 2.将ckeditor注册到settings.py文件中, 并添加ckeditor的url到你项目 ...

0_Simple__asyncAPI

0_Simple__asyncAPI的更多相关文章

随机推荐

热门专题