0_Simple__simpleMultiCopy

利用 CUDA 的 Overlap 特性同时进行运算和数据拷贝来实现加速。

▶ 源代码。使用 4 个流一共执行 10 次 “数据上传 - 内核计算 - 数据下载” 过程，记录使用时间。

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 #include <helper_functions.h>

 #define STREAM_COUNT 4

 #define NREPS        10

 #define INNER_REPS   5

 int N =  << ;

 int memsize;

 int *h_data_source;

 int *h_data_sink;

 int *h_data_in[STREAM_COUNT];

 int *h_data_out[STREAM_COUNT];

 int *d_data_in[STREAM_COUNT];

 int *d_data_out[STREAM_COUNT];

 dim3 grid;

 dim3 block();

 cudaEvent_t cycleDone[STREAM_COUNT], start, stop;

 cudaStream_t stream[STREAM_COUNT];

 __global__ void incKernel(int *g_out, int *g_in, int size)

 {

     int idx = blockIdx.x * blockDim.x + threadIdx.x;

     if (idx < size)

     {

         for (int i = ; i < INNER_REPS; ++i)// 暴力重复 5 次，不会被编译器优化掉？

             g_out[idx] = g_in[idx] + ;

     }

 }

 float processWithStreams(int streams_used)

 {

     cudaEventRecord(start, );

     for (int i = , current_stream = ; i < NREPS; ++i)

     {

         int next_stream = (current_stream + ) % streams_used;

 #ifdef SIMULATE_IO// ？

         // 改变下载数据

         memcpy(h_data_sink, h_data_out[current_stream], memsize);

         // 改变上传数据

         memcpy(h_data_in[next_stream], h_data_source, memsize);

 #endif

         // 保证上一次循环中位于流 next_stream 中的任务已经完成

         cudaEventSynchronize(cycleDone[next_stream]);

         // 执行当前流的内核

         incKernel << <grid, block, , stream[current_stream] >> > (d_data_out[current_stream], d_data_in[current_stream], N);

         // 执行下一个流的数据上传

         cudaMemcpyAsync(d_data_in[next_stream],h_data_in[next_stream],memsize,cudaMemcpyHostToDevice,stream[next_stream]);

         // 执行当前流的数据下载

         cudaMemcpyAsync(h_data_out[current_stream],d_data_out[current_stream],memsize,cudaMemcpyDeviceToHost,stream[current_stream]);

         cudaEventRecord(cycleDone[current_stream],stream[current_stream]);

         current_stream = next_stream;

     }

     cudaEventRecord(stop, );

     cudaDeviceSynchronize();

     float time;

     cudaEventElapsedTime(&time, start, stop);

     return time;

 }

 bool test()

 {

     bool passed = true;

     for (int j = ; j<STREAM_COUNT; ++j)

     {

         for (int i = ; i < N; ++i)

             passed &= (h_data_out[j][i] == );

     }

     return passed;

 }

 int main(int argc, char *argv[])

 {

     printf("\n\tStart.\n"); 

     // 挑选设备和分析设备性能

     int cuda_device = ;

     cudaDeviceProp deviceProp;

     if (checkCmdLineFlag(argc, (const char **)argv, "device"))

     {

         if ((cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=")) < )

         {

             printf("Invalid command line parameters\n");

             exit(EXIT_FAILURE);

         }

         else

         {

             printf("cuda_device = %d\n", cuda_device);

             if ((cuda_device = gpuDeviceInit(cuda_device)) < )

             {

                 printf("No CUDA Capable devices found, exiting...\n");

                 exit(EXIT_SUCCESS);

             }

         }

     }

     else

     {

         cuda_device = gpuGetMaxGflopsDeviceId();

         cudaSetDevice(cuda_device);

         cudaGetDeviceProperties(&deviceProp, cuda_device);

         printf("\n\tDevice [%d]: %s, computation cability %d.%d, ", cuda_device, deviceProp.name, deviceProp.major, deviceProp.minor);

     }

     cudaGetDeviceProperties(&deviceProp, cuda_device);

     printf("%d MP(s) x %d (Cores/MP) = %d (Cores)\n",

            deviceProp.multiProcessorCount,

            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),// 依计算能力反应流处理器个数情况，定义于 helper_cuda.h

            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

     printf("\n\tRelevant properties of this CUDA device.\n");

     printf("\t(%c) Execute several GPU kernels simultaneously\n", deviceProp.major >=  ? 'Y' : 'N');

     printf("\t(%c) Overlap one CPU<->GPU data transfer with GPU kernel execution\n", deviceProp.deviceOverlap ? 'Y' : 'N');

     printf("\t(%c) Overlap two CPU<->GPU data transfers with GPU kernel execution\n",(deviceProp.major >=  && deviceProp.asyncEngineCount > )? 'Y' : 'N');

     // 如果流处理器个数少于 32，则降低工作负荷

     float scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);

     N = (int)((float)N / scale_factor);

     printf("\n\tscale_factor = %.2f\n\tarray_size   = %d\n", 1.0f / scale_factor, N);

     // 准备运行配置

     memsize = N * sizeof(int);

     int thread_blocks = N / block.x;

     grid.x = thread_blocks % ;

     grid.y = (thread_blocks /  + );      

     h_data_source = (int *) malloc(memsize);

     h_data_sink = (int *) malloc(memsize);

     for (int i = ; i < STREAM_COUNT; ++i)

     {

         cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);

         cudaMalloc(&d_data_in[i], memsize);

         cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault);

         cudaMalloc(&d_data_out[i], memsize);

         cudaStreamCreate(&stream[i]);

         cudaEventCreate(&cycleDone[i]);

         cudaEventRecord(cycleDone[i], stream[i]);

     }

     cudaEventCreate(&start);

     cudaEventCreate(&stop);

     // 初始化 h_data_source 和 h_data_in

     for (int i = ; i<N; ++i)

         h_data_source[i] = ;

     for (int i = ; i < STREAM_COUNT; ++i)

         memcpy(h_data_in[i], h_data_source, memsize);

     // 预跑

     incKernel<<<grid, block>>>(d_data_out[], d_data_in[], N);

     // 各种测试

     cudaEventRecord(start,);

     cudaMemcpyAsync(d_data_in[], h_data_in[], memsize, cudaMemcpyHostToDevice, );

     cudaEventRecord(stop,);

     cudaEventSynchronize(stop);

     float memcpy_h2d_time;

     cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

     cudaEventRecord(start,);

     cudaMemcpyAsync(h_data_out[], d_data_out[], memsize, cudaMemcpyDeviceToHost, );

     cudaEventRecord(stop,);

     cudaEventSynchronize(stop);

     float memcpy_d2h_time;

     cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

     cudaEventRecord(start,);

     incKernel<<<grid, block,,>>>(d_data_out[], d_data_in[], N);

     cudaEventRecord(stop,);

     cudaEventSynchronize(stop);

     float kernel_time;

     cudaEventElapsedTime(&kernel_time, start, stop);

     printf("\n\tMeasured timings (throughput):\n");

     printf("\tMemcpy host to device:\t%f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-) / memcpy_h2d_time);

     printf("\tMemcpy device to host:\t%f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-) / memcpy_d2h_time);

     printf("\tKernel:               \t%f ms (%f GB/s)\n", kernel_time, (INNER_REPS *memsize * 2e-) / kernel_time); 

     printf("\n\tTheoretical limits for speedup gained from overlapped data transfers:\n");

     printf("\tNo overlap (transfer-kernel-transfer):\t%f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);

     printf("\tOverlap one transfer:                 \t%f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));

     printf("\tOverlap both data transfers:          \t%f ms\n", max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

     // 使用 Overlap 特性进行计算

     float serial_time = processWithStreams();

     float overlap_time = processWithStreams(STREAM_COUNT);

     printf("\n\tAverage measured timings over %d repetitions:\n", NREPS);

     printf("\tAvg. time serialized:      \t%f ms (%f GB/s)\n", serial_time / NREPS, (NREPS * (memsize * 2e-)) / serial_time);

     printf("\tAvg. time using %d streams:\t%f ms (%f GB/s)\n", STREAM_COUNT, overlap_time / NREPS, (NREPS * (memsize * 2e-)) / overlap_time);

     printf("\tAvg. speedup gained:       \t%f ms\n", (serial_time - overlap_time) / NREPS);

     printf("\n\tResult test: %s.\n", test() ? "Passed" : "Failed");

     // 回收工作

     free(h_data_source);

     free(h_data_sink);

     for (int i =; i<STREAM_COUNT; ++i)

     {

         cudaFreeHost(h_data_in[i]);

         cudaFree(d_data_in[i]);

         cudaFreeHost(h_data_out[i]);

         cudaFree(d_data_out[i]);

         cudaStreamDestroy(stream[i]);

         cudaEventDestroy(cycleDone[i]);

     }

     cudaEventDestroy(start);

     cudaEventDestroy(stop);

     getchar();

     return ;

 }

▶ 输出结果

    Start.

    Device []: GeForce GTX , computation cability 6.1,  MP(s) x  (Cores/MP) =  (Cores)

    Relevant properties of this CUDA device.

    (Y) Execute several GPU kernels simultaneously

    (Y) Overlap one CPU<->GPU data transfer with GPU kernel execution

    (Y) Overlap two CPU<->GPU data transfers with GPU kernel execution

    scale_factor = 1.00

    array_size   = 

    Measured timings (throughput):

    Memcpy host to device:  1.276192 ms (13.146311 GB/s)

    Memcpy device to host:  1.279008 ms (13.117366 GB/s)

    Kernel:                 1.312768 ms (127.800314 GB/s)

    Theoretical limits for speedup gained from overlapped data transfers:

    No overlap (transfer-kernel-transfer):  3.867968 ms

    Overlap one transfer:                   2.555200 ms

    Overlap both data transfers:            1.312768 ms

    Average measured timings over  repetitions:

    Avg. time serialized:           3.992167 ms (8.405068 GB/s)

    Avg. time using  streams:      1.896141 ms (17.696171 GB/s)

    Avg. speedup gained:            2.096026 ms

    Result test: Passed.

▶ 涨姿势

● 没有

0_Simple__simpleMultiCopy的更多相关文章

随机推荐

War Chess bfs+优先队列
War chess is hh's favorite game: In this game, there is an N * M battle map, and every player has hi ...
2018-2019-2 20165212《网络攻防技术》Exp5 MSF基础应用
2018-2019-2 20165212<网络攻防技术>Exp5 MSF基础应用攻击成果主动攻击的实践 ms17010eternalblue payload windows/x64/m ...
Weston学习
来源网址:http://blog.csdn.net/fyh2003/article/details/49253713 Weston启动时会读取weston.ini这个配置文件,其中可以配置桌面,动画和 ...
Ribbon Status Bar
https://documentation.devexpress.com/#WindowsForms/CustomDocument2498 官方文档说明 A Ribbon Status Bar (Ri ...
记录 ThinkPHP 5.* 漏洞修复后的情况
记录 ThinkPHP 5.* 漏洞修复后的情况 ThinkPHP 官方 2018-12-09 下午收到漏洞报告. 2018-12-09 晚上看到 Git 已经更新了,修复了漏洞. 2018-12-1 ...
调试 FastAdmin 出现 Failed to parse SourceMap
看到群里有人说在调试 FastAdmin出现 SourceMap 出错. 报错信息为: Failed to parse SourceMap 来自 Karson 说明: 这个文件是用于匹配原有less中 ...
ionic3之自定义tabs菜单图标
请参考以下链接 ionic3之自定义tabs菜单图标
Microsoft Dynamics CRM 2011 Plugin中PluginExecutionContext.InputParameters["Target"]中的Target是从哪来的？
图 1 如图1,CRM编程是一个请求响应模型,任何操作都是通过一个Request发起,一个Response返回结果,这个模型简单实用.所有请求类都是继承OrganizationRequest,所有响应 ...
Windows平台下的线程同步
引子: 这几天在写一个windows phone平台上的service,由于Windows phone 的内核是基于Windows NT的,这也意味着写Windows Phone的Service代码与 ...
Django url反向解析与路由分发名称空间
url反向解析 url.py from django.conf.urls import url from django.contrib import admin from app01 import v ...

0_Simple__simpleMultiCopy

0_Simple__simpleMultiCopy的更多相关文章

随机推荐

热门专题