0_Simple__clock + 0_Simple__clock

▶ 使用 clock() 函数在CUDA核函数内部进行计时，将核函数封装为PTX并在另外的代码中读取和使用。

▶ 源代码：文件内建核函数计时

 #include <stdio.h>

 #include <stdlib.h>

 #include <time.h>

 #include <cuda.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 #include <helper_string.h>

 #define NUM_BLOCKS    1

 #define NUM_THREADS   1024

 __global__ static void timedReduction(const float *input, float *output, clock_t *timer)

 {

     extern __shared__ float shared[];

     const int tid = threadIdx.x, bid = blockIdx.x;

     if (tid == )                                           // 0 号线程记录开始时间，调用 time.h 的计时器，一个线程块有一个开始时间和一个结束时间

         timer[bid] = clock();

     shared[tid] = input[tid];

     shared[tid + blockDim.x] = input[tid + blockDim.x];

     for (int d = blockDim.x; d > ; d /= )                 // 二分规约求最小值，每次循环后较小值保存在前半段上

     {

         if (tid < d)

         {

             float f0 = shared[tid], f1 = shared[tid + d];

             if (f1 < f0)

                 shared[tid] = f1;

         }

         __syncthreads();

     }

     if (tid == )                                           // 0 号线程输出结果

         output[bid] = shared[];

     __syncthreads();

     if (tid == )                                           // 0 号线程记录结束时间

         timer[bid + gridDim.x] = clock();

 }

 int main(int argc, char **argv)

 {

     int dev = findCudaDevice(argc, (const char **)argv);    // helper_cuda.h 中设置设备的函数

     clock_t timer[NUM_BLOCKS * ];

     float input[NUM_THREADS * ];

     for (int i = ; i < NUM_THREADS * ; i++)

         input[i] = (float)i;

     float *dinput = NULL, *doutput = NULL;

     clock_t *dtimer = NULL;

     cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * );

     cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS);

     cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * );

     cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * , cudaMemcpyHostToDevice);

     timedReduction << <NUM_BLOCKS, NUM_THREADS, sizeof(float) *  * NUM_THREADS >> >(dinput, doutput, dtimer);

     cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * , cudaMemcpyDeviceToHost);

     cudaFree(dinput);

     cudaFree(doutput);

     cudaFree(dtimer);

     long double sumElapsedClocks = ;                       // 计算平均耗时

     for (int i = ; i < NUM_BLOCKS; i++)

         sumElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);

     printf("Average clocks/block = %f\n", sumElapsedClocks / NUM_BLOCKS);

     getchar();

     return EXIT_SUCCESS;

 }

● 输出结果，比较不同的 blockDim.x 和 threadDim.x 情况结果如下图表所示。

▶ 涨姿势：

● 在核函数中也能使用 time.h 中的 clock_t 变量，并用clock() 函数计时。

▶ 源代码，封装核函数并在另外的代码中使用。分成核函数部分 clock_fernel.cu 和主函数部分 clock.cpp

 // clock_kernel.cu

 __global__ static void timedReduction(const float *input, float *output, clock_t *timer)

 {

     extern __shared__ float shared[];

     const int tid = threadIdx.x, bid = blockIdx.x;

     if (tid == )

         timer[bid] = clock();                               

     shared[tid] = input[tid];

     shared[tid + blockDim.x] = input[tid + blockDim.x];

     for (int d = blockDim.x; d > ; d /= )

     {

         if (tid < d)

         {

             float f0 = shared[tid], f1 = shared[tid + d];

             if (f1 < f0)

                 shared[tid] = f1;

         }

         __syncthreads();

     }

     if (tid == )

         output[bid] = shared[];

     __syncthreads();

     if (tid == )

         timer[bid + gridDim.x] = clock();

 }

 // main.c

 #include <stdio.h>

 #include <cuda.h>

 #include <cuda_runtime.h>

 #include <driver_functions.h>

 #include <nvrtc_helper.h>

 #define NUM_BLOCKS    64

 #define NUM_THREADS   256

 int main(int argc, char **argv)

 {

     typedef long clock_t;

     clock_t timer[NUM_BLOCKS * ];

     float input[NUM_THREADS * ];

     for (int i = ; i < NUM_THREADS * ; i++)

         input[i] = (float)i;

     char *kernel_file = sdkFindFilePath("clock_kernel.cu", argv[]);        // 找到核函数代码文件

     char *ptx;

     size_t ptxSize;

     compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, );              // 将指定核函数编译为 PTX，放在指针 ptx 指向的地址，大小为ptxSize

     CUmodule module = loadPTX(ptx, argc, argv);                             // 读取编译好的 PTX

     CUfunction kernel_name;

     cuModuleGetFunction(&kernel_name, module, "timedReduction");            // 取出 PTX 中的函数 timeReducetion()

     CUdeviceptr dinput, doutput, dtimer;                                    // 内存申请和拷贝

     cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * );

     cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS);

     cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * );

     cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * );

     dim3 cudaGridSize(NUM_BLOCKS, , ), cudaBlockSize(NUM_THREADS, , );

     void *arr[] = { (void *)&dinput, (void *)&doutput, (void *)&dtimer };   // 封装核函数实参的指针

     cuLaunchKernel(kernel_name,                                             // 调用核函数，函数名

         cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,                     // gridDim 分量

         cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z,                  // blockDim 分量

         sizeof(float) *  * NUM_THREADS, ,                                 // 共享内存和流号

         &arr[], );                                                        // 实参和其他参数

     cuCtxSynchronize();                                                     // 上下文同步，作用接近 cudaDeviceSynchronize()

     cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * );

     cuMemFree(dinput);

     cuMemFree(doutput);

     cuMemFree(dtimer);

     long double sumElapsedClocks = ;                                       // 计算耗时

     for (int i = ; i < NUM_BLOCKS; i++)

         sumElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);

     printf("Average clocks/block = %Lf\n", sumElapsedClocks / NUM_BLOCKS);

     getchar();

     return EXIT_SUCCESS;

 }

● 输出结果：

sdkFindFilePath <clock_kernel.cu> in ./

> Using CUDA Device []: GeForce GTX

> GPU Device has SM 6.1 compute capability

Average clocks/block = 3058.000000

▶ 涨姿势：

● 在外部核函数代码文件中采用 extern "C" __global__ void functionName() 来定义函数

● 使用 PTX 过程中涉及的函数

 // 依文件名搜索其绝对路径，传入需要查找的目标文件名 filename 和可选的可执行文件目录 executable_path

 inline char *sdkFindFilePath(const char *filename, const char *executable_path)

 {

     const char *searchPath[] = { "./" };                                    // 默认搜索路径只有当前目录，源代码中罗列了很多文件目录

     std::string executable_name;

     if (executable_path != )

     {

         executable_name = std::string(executable_path);

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)  // 注意 Windows 和 Linux 文件路径的分隔符不同

         size_t delimiter_pos = executable_name.find_last_of('\\');

         executable_name.erase(, delimiter_pos + );

         if (executable_name.rfind(".exe") != std::string::npos)

             executable_name.resize(executable_name.size() - );

 #else

         size_t delimiter_pos = executable_name.find_last_of('/');

         executable_name.erase(, delimiter_pos + );

 #endif

     }

     for (unsigned int i = ; i < sizeof(searchPath) / sizeof(char *); ++i)  // 遍历查找路径，找到第一个匹配的路径

     {

         std::string path(searchPath[i]);

         size_t executable_name_pos = path.find("<executable_name>");

         if (executable_name_pos != std::string::npos)

         {

             if (executable_path != )                                       // 额外路径非空，替换掉path中的值

                 path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);

             else                                                            // 额外路径为空，不做调整

                 continue;

         }

 #ifdef _DEBUG

         printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());

 #endif        

         path.append(filename);                                              // 根据搜索的结果测试文件是否存在

         FILE *fp;

         FOPEN(fp, path.c_str(), "rb");                                      // 在 helper_strings.h 中 #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)

         if (fp != NULL)

         {

             fclose(fp);

             char *file_path = (char *)malloc(path.length() + );

             STRCPY(file_path, path.length() + , path.c_str());             // #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)

             return file_path;

         }

         if (fp)                                                             // ？

             fclose(fp);

     }

     return ;                                                               // 没有找到文件，返回 0

 }

 // 文本编译为PTX，输入文件名，编译参数，指向编译结果的指针，指向存放编译结果大小的指针

 // 新版本中还多了一个参数 int requiresCGheaders，是否要用到 cooperative_groups.h，即向编译选项中添加 --include-path=cooperative_groups.h

 void compileFileToPTX(char *filename, int argc, const char **argv, char **ptxResult, size_t *ptxResultSize)

 {

     std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);

     if (!inputFile.is_open())

     {

         std::cerr << "\nerror: unable to open " << filename << " for reading!\n";

         exit();

     }

     std::streampos pos = inputFile.tellg();

     size_t inputSize = (size_t)pos;

     char * memBlock = new char[inputSize + ];

     inputFile.seekg(, std::ios::beg);

     inputFile.read(memBlock, inputSize);

     inputFile.close();

     memBlock[inputSize] = '\x0';

     nvrtcProgram prog;                                                                                  // 编译

     NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, memBlock, filename, , NULL, NULL));

     nvrtcResult res = nvrtcCompileProgram(prog, argc, argv);

     size_t logSize;                                                                                     // 写日志

     NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));

     char *log = (char *)malloc(sizeof(char) * logSize + );

     NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));

     log[logSize] = '\x0';

     //std::cerr << "\n compilation log ---\n";

     //std::cerr << log;

     //std::cerr << "\n end log ---\n";

     free(log);

     NVRTC_SAFE_CALL("nvrtcCompileProgram", res);

     // fetch PTX

     size_t ptxSize;

     NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxSize));

     char *ptx = (char *)malloc(sizeof(char) * ptxSize);

     NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));

     NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));

     *ptxResult = ptx;

     *ptxResultSize = ptxSize;

 }

 // 传入文件名和错误信息内容，向std_err中输出

 #define NVRTC_SAFE_CALL(Name, x)                                                                        \

     do                                                                                                  \

     {                                                                                                   \

         nvrtcResult result = x;                                                                         \

         if (result != NVRTC_SUCCESS)                                                                    \

         {                                                                                               \

             std::cerr << "\nerror: " << Name << " failed with error " << nvrtcGetErrorString(result);   \

             exit();                                                                                    \

         }                                                                                               \

     } while()

 // 读取编译好的PTX为模块，传入指向 ptx 代码的指针和额外参数

 CUmodule loadPTX(char *ptx, int argc, char **argv)

 {

     CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);// 查找设备，返回设备信息

     int major = , minor = ;

     char deviceName[];

     cuDeviceComputeCapability(&major, &minor, cuDevice);

     cuDeviceGetName(deviceName, , cuDevice);

     printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

     cuInit();                                  // cuda 设备初始化 CUresult CUDAAPI cuInit(unsigned int Flags);

     cuDeviceGet(&cuDevice, );                  // 返回设备编号 CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);

     CUcontext context;

     cuCtxCreate(&context, , cuDevice);         // 创建上下文 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);

     CUmodule module;

     cuModuleLoadDataEx(&module, ptx, , , );  // 读取模块信息 CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);

     return module;

 }

 CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);   // 从模块中取出指定的函数

 CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);                            // 类似cudaMalloc

 // 调用核函数的完整格式

 CUresult CUDAAPI cuLaunchKernel(CUfunction f,

     unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,

     unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);

 CUresult CUDAAPI cuCtxSynchronize(void);                                                    // 上下文同步

 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);                                               // 类似 cudaFree

0_Simple__clock + 0_Simple__clock_nvrtc的更多相关文章

随机推荐

如何解决Python.h：No such file or directory
安装python2.7对应的dev sudo apt-get install python-dev 安装python3.6对应的dev sudo apt-get install python3-dev
shell查找指定时间段内的文件
#!/bin/bash#20170905 输入参数格式echo "显示"$1"的备份文件"date_0=$1date_1=`expr $date_0 + 1`d ...
快速搭建属于自己的数据库——mongodb
为了真实模拟一个项目上线,拥有前端后端数据库都具备的功能,我选择了mongodb作为项目的数据库支持,这里分享一些mongodb的经验心得和血的教训. mongoddb安装在本地安装直接通过官网下 ...
Codecraft-17 and Codeforces Round #391 (Div. 1 + Div. 2, combined)D. Felicity's Big Secret Revealed
题目连接:http://codeforces.com/contest/757/problem/D D. Felicity's Big Secret Revealed time limit per te ...
ASP.Net MVC 布局页模板页使用方法详细说明
一.Views文件夹 -> Shared文件夹下的 _Layout.cshtml 母版页 @RenderBody 当创建基于_Layout.cshtml布局页面的视图时,视图的内容会和布局页面合 ...
clone github报Permission denied (publickey) 解决方案
问题描述问题产生的原因,不是很清楚,就不管了.在执行git clone git@github.com:****.git 的时候报了Permission denied (publickey). War ...
python web框架之Tornado
说Tornado之前分享几个前端不错的网站: -- Bootstrap http://www.bootcss.com/ -- Font Awesome http://fontawesome.io/ - ...
win10 sdk 是否向下兼容
向下兼容(downward compatibility),又称向后兼容(backward compatibility).回溯兼容,在计算机中指在一个程序.库或硬件更新到较新版本后,用旧版本程序创建的文 ...
Developing Universal Windows Apps 开发UWA应用问答
开始是一些欢迎,就不翻译 Question: Is the code already there? Answer: There is some code on that codeplex site, ...
阻塞IO
服务端 from socket import * server = socket(AF_INET,SOCK_STREAM) server.bind(('127.0.0.1',8080)) server ...

0_Simple__clock + 0_Simple__clock_nvrtc

0_Simple__clock + 0_Simple__clock_nvrtc的更多相关文章

随机推荐

热门专题