▶ Vector addition computed through three interfaces: the CUDA Runtime API, runtime compilation (NVRTC), and the Driver API

▶ Source code: CUDA Runtime API

#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>

#define ELEMENT 50000

__global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}

int main()
{
    printf("\tStart.\n");
    size_t size = ELEMENT * sizeof(float);
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);
    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
            getchar();
            return 1;
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

● Output:

    Start.

    Finish.
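
● None of the Runtime API calls above check their return values. Below is a minimal error-checking sketch; CUDA_CHECK is a hypothetical macro written here for illustration (the samples' helper_cuda.h provides checkCudaErrors for the same purpose), while cudaGetErrorString, cudaGetLastError and cudaDeviceSynchronize are standard Runtime API calls.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// hypothetical checking macro: print the error string and abort on failure
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            printf("CUDA error \"%s\" at %s:%d\n",                            \
                   cudaGetErrorString(err_), __FILE__, __LINE__);             \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// usage on the calls from the listing above:
//     CUDA_CHECK(cudaMalloc((void **)&d_A, size));
//     CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
//     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
//     CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors
//     CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs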

▶ Source code: runtime compilation (NVRTC)

// vectorAdd_kernel.cu
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}

// vectorAdd.cpp
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <nvrtc_helper.h>

#define ELEMENT 50000

int main()
{
    printf("\n\tStart.\n");

    // compile the .cu source to PTX at run time and load it as a module
    // (compileFileToPTX and loadPTX come from the sample's nvrtc_helper.h)
    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\vectorAdd_nvrtc\\vectorAdd_kernel.cu";
    compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize, 0);
    CUmodule module = loadPTX(ptx, 0, NULL);
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, module, "vectorAdd");

    size_t size = ELEMENT * sizeof(float);
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    CUdeviceptr d_A, d_B, d_C;
    cuMemAlloc(&d_A, size);
    cuMemAlloc(&d_B, size);
    cuMemAlloc(&d_C, size);
    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    int threadsPerBlock = 256;
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize((ELEMENT + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
    int element = ELEMENT;
    void *arr[] = { (void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&element };
    cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
                   cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z,
                   0, 0, &arr[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);

    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
            getchar();
            return 1;
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

● Output:

        Start.
> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability

    Finish.
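
● compileFileToPTX and loadPTX are helpers from the sample's nvrtc_helper.h, not part of the CUDA toolkit itself; roughly, they wrap the NVRTC and Driver APIs. The sketch below shows the same steps with the raw APIs, assuming src already holds the contents of vectorAdd_kernel.cu and that compute_61 is a suitable target for the SM 6.1 device above (error handling omitted).

#include <cuda.h>
#include <nvrtc.h>

// sketch: compile a CUDA source string to PTX with NVRTC, then load it through the Driver API
CUfunction compileAndLoad(const char *src)
{
    // compile the source to PTX
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "vectorAdd_kernel.cu", 0, NULL, NULL);
    const char *opts[] = { "--gpu-architecture=compute_61" };   // assumed target architecture
    nvrtcCompileProgram(prog, 1, opts);
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    char *ptx = new char[ptxSize];
    nvrtcGetPTX(prog, ptx);
    nvrtcDestroyProgram(&prog);

    // create a context and load the PTX as a module
    cuInit(0);
    CUdevice dev;
    CUcontext ctx;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    CUmodule module;
    CUfunction kernel;
    cuModuleLoadData(&module, ptx);
    cuModuleGetFunction(&kernel, module, "vectorAdd");
    delete[] ptx;
    return kernel;
}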

▶ Source code: Driver API. This version also needs the vectorAdd_kernel.cu above, pre-compiled to PTX. There are three ways to pass the kernel arguments to cuLaunchKernel; the middle one is problematic and produces wrong results.

#include <stdio.h>
#include <math.h>
#include <string>
#include <helper_cuda.h>
#include <cuda.h>
#include <drvapi_error_string.h>

#define ELEMENT 50000
#define PATH "C:\\ProgramData\\NVIDIA Corporation\\CUDA Samples\\v9.1\\0_Simple\\vectorAddDrv\\data\\"

#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "vectorAdd_kernel64.ptx"
#else
#define PTX_FILE "vectorAdd_kernel32.ptx"
#endif

using namespace std;

void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

int main(int argc, char **argv)
{
    printf("\n\tStart.\n");
    cuInit(0);   // counterpart of cudaSetDevice(0) in the Runtime API; the driver must be initialized before a context can be created
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, 0);

    // read the PTX source from disk
    string module_path, ptx_source;
    module_path = PATH "vectorAdd_kernel64.ptx";
    FILE *fp = fopen(module_path.c_str(), "rb");
    fseek(fp, 0, SEEK_END);
    int file_size = ftell(fp);
    char *buf = new char[file_size + 1];
    fseek(fp, 0, SEEK_SET);
    fread(buf, sizeof(char), file_size, fp);
    fclose(fp);
    buf[file_size] = '\0';
    ptx_source = buf;
    delete[] buf;

    CUmodule cuModule;
    if (module_path.rfind("ptx") != string::npos)   // a .ptx file has to be JIT-compiled when it is loaded
    {
        // set up the JIT options: jitOptions holds the option names, jitOptVals the corresponding values
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];
        // size of the JIT log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
        // the JIT log buffer itself
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;
        // maximum number of registers the kernel may use
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;
        // compile and load the module
        cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
        //printf("> PTX JIT log:\n%s\n", jitLogBuffer);   // print the JIT log
        delete[] jitLogBuffer;
        delete[] jitOptVals;
        delete[] jitOptions;
    }
    else   // a .cubin can be loaded directly, no compilation needed (this branch is not taken in this example)
        cuModuleLoad(&cuModule, module_path.c_str());

    CUfunction vecAdd_kernel;
    cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel");   // fetch the kernel from the loaded module

    // allocate memory and run the computation
    int element = ELEMENT;
    size_t size = ELEMENT * sizeof(float);
    float *h_A, *h_B, *h_C;
    CUdeviceptr d_A, d_B, d_C;
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);
    RandomInit(h_A, ELEMENT);
    RandomInit(h_B, ELEMENT);
    cuMemAlloc(&d_A, size);
    cuMemAlloc(&d_B, size);
    cuMemAlloc(&d_C, size);
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    int threadsPerBlock = 256;
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    if (1)   // first of three ways of passing arguments through the Driver API: the kernelParams array
    {
        void *args[] = { &d_A, &d_B, &d_C, &element };
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL);
    }
    else if (0)   // problematic: byte offsets index an array of void *, and the raw buffer is passed as "extra" without the CU_LAUNCH_PARAM_* description
    {
        int offset = 0;
        void *argBuffer[32];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = element;
        offset += sizeof(element);
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer);
    }
    else   // correct: pack the arguments into a byte buffer and describe it with CU_LAUNCH_PARAM_*
    {
        size_t offset = 0;   // CU_LAUNCH_PARAM_BUFFER_SIZE expects a pointer to size_t
        char argBuffer[256];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = element;
        offset += sizeof(element);
        void *kernel_launch_config[] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
            CU_LAUNCH_PARAM_END
        };
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, (void **)&kernel_launch_config);
    }
    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);

    int i;
    for (i = 0; i < ELEMENT; ++i)
    {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-7f)
        {
            printf("Error at i == %d, h_C[i] == %f, sum == %f", i, h_C[i], sum);
            break;
        }
    }
    printf("\n\tFinish: %s\n", (i == ELEMENT) ? "Pass" : "Fail");
    getchar();
    return 0;
}

● Output:

    Start.

    Finish.
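
● The Driver API calls above are likewise unchecked, although every one of them returns a CUresult. A minimal sketch of a checking macro follows; CU_CHECK is a hypothetical name, while cuGetErrorName is the Driver API's own error-to-string query (the full sample does the equivalent through checkCudaErrors and drvapi_error_string.h).

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// hypothetical checking macro for Driver API calls
#define CU_CHECK(call)                                                        \
    do {                                                                      \
        CUresult res_ = (call);                                               \
        if (res_ != CUDA_SUCCESS) {                                           \
            const char *name_ = NULL;                                         \
            cuGetErrorName(res_, &name_);                                     \
            printf("Driver API error %s at %s:%d\n",                          \
                   name_ ? name_ : "UNKNOWN", __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// usage, e.g. on the module load, which is the first call to fail when the .ptx path is wrong:
//     CU_CHECK(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals));
//     CU_CHECK(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));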

▶ Takeaways:

● Parts trimmed from the original sample code

CUresult CleanupNoFailure()   // frees all resources and returns any error
{
    CUresult error;
    // Free device memory
    if (d_A)
        error = cuMemFree(d_A);
    if (d_B)
        error = cuMemFree(d_B);
    if (d_C)
        error = cuMemFree(d_C);
    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
    error = cuCtxDestroy(cuContext);
    return error;
}

void Cleanup(bool noError)   // reports failure
{
    CUresult error = CleanupNoFailure();
    if (!noError || error != CUDA_SUCCESS)
    {
        printf("Function call failed\nFAILED\n");
        exit(EXIT_FAILURE);
    }
    if (!noprompt)
    {
        printf("\nPress ENTER to exit...\n");
        fflush(stdout);
        fflush(stderr);
        getchar();
    }
}

if (error != CUDA_SUCCESS)   // how Cleanup is invoked by the caller
    Cleanup(false);

if (argc > 1)   // in main(): the command-line argument -device=n selects the device
{
    bool bFound = false;
    for (int param = 1; param < argc; param++)   // examine the arguments one by one
    {
        int string_start = 0;
        while (argv[param][string_start] == '-')   // skip leading '-' characters
            string_start++;
        char *string_argv = &argv[param][string_start];
        if (!strncmp(string_argv, "device", 6))   // check whether the option is "device"
        {
            int len = (int)strlen(string_argv);
            while (string_argv[len] != '=')
                len--;
            devID = atoi(&string_argv[++len]);
            bFound = true;
        }
        if (bFound)
            break;
    }
}
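
● The hand-rolled -device=n parsing above is what the samples' helper_string.h helpers do for you. A minimal sketch assuming those helpers are available on the include path (checkCmdLineFlag and getCmdLineArgumentInt are sample helpers, not toolkit APIs):

#include <stdio.h>
#include <helper_string.h>   // command-line helpers shipped with the CUDA samples

int main(int argc, char **argv)
{
    int devID = 0;
    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
    printf("Using device %d\n", devID);
    return 0;
}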
