Using C++ templates

▶ Source code: static compilation

 // sharedmem.cuh
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_

// Wrapper for SharedMemory: the primary template has no usable implementation
// (error() is declared but never defined, so any instantiation fails to link);
// only the specializations below can actually be used.
template <typename T> struct SharedMemory
{
    __device__ T *getPointer()
    {
        extern __device__ void error(void);
        error();
        return NULL;
    }
};

// Implementations of SharedMemory for the supported data types
template <> struct SharedMemory<int>
{
    __device__ int *getPointer()
    {
        extern __shared__ int s_int[];
        return s_int;
    }
};

template <> struct SharedMemory<unsigned int>
{
    __device__ unsigned int *getPointer()
    {
        extern __shared__ unsigned int s_uint[];
        return s_uint;
    }
};

template <> struct SharedMemory<char>
{
    __device__ char *getPointer()
    {
        extern __shared__ char s_char[];
        return s_char;
    }
};

template <> struct SharedMemory<unsigned char>
{
    __device__ unsigned char *getPointer()
    {
        extern __shared__ unsigned char s_uchar[];
        return s_uchar;
    }
};

template <> struct SharedMemory<short>
{
    __device__ short *getPointer()
    {
        extern __shared__ short s_short[];
        return s_short;
    }
};

template <> struct SharedMemory<unsigned short>
{
    __device__ unsigned short *getPointer()
    {
        extern __shared__ unsigned short s_ushort[];
        return s_ushort;
    }
};

template <> struct SharedMemory<long>
{
    __device__ long *getPointer()
    {
        extern __shared__ long s_long[];
        return s_long;
    }
};

template <> struct SharedMemory<unsigned long>
{
    __device__ unsigned long *getPointer()
    {
        extern __shared__ unsigned long s_ulong[];
        return s_ulong;
    }
};

template <> struct SharedMemory<bool>
{
    __device__ bool *getPointer()
    {
        extern __shared__ bool s_bool[];
        return s_bool;
    }
};

template <> struct SharedMemory<float>
{
    __device__ float *getPointer()
    {
        extern __shared__ float s_float[];
        return s_float;
    }
};

template <> struct SharedMemory<double>
{
    __device__ double *getPointer()
    {
        extern __shared__ double s_double[];
        return s_double;
    }
};

#endif
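Why the wrapper is needed: declaring extern __shared__ T sdata[] directly inside a templated kernel makes every instantiation redeclare the same shared array with a different type, which the CUDA compiler rejects. A commonly used alternative (not part of this sample) avoids the per-type specializations by declaring a single untyped array and casting it; a minimal sketch, where sharedMemoryAs is a hypothetical helper name:

// A minimal alternative sketch (not from this sample): one raw shared array,
// reinterpreted as T. __align__(8) guards against misaligned access for
// 8-byte element types such as double and long.
template <typename T>
__device__ T *sharedMemoryAs()
{
    extern __shared__ __align__(8) unsigned char smem_raw[];
    return reinterpret_cast<T *>(smem_raw);
}

A kernel would then write T *sdata = sharedMemoryAs<T>(); in place of the SharedMemory<T> pair of lines.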
 // simpleTemplates.cu
#include <stdio.h>
#include <timer.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "sharedmem.cuh"

template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    // Together, the two lines above are equivalent to: extern __shared__ T sdata[];
    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();
    sdata[tid] = (T) blockDim.x * sdata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}

// Compute the reference (gold) results on the host
template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // cast (const unsigned int -> T) and add the const qualifier
    for (unsigned int i = 0; i < len; ++i)
        reference[i] = idata[i] * T_len;
}

// Wrapper for ArrayComparator
template<class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// Implementations for int and float; compareData() is defined in helper_image.h
template<> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
};

template<> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
};

// Wrapper for ArrayFileWriter
template<class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// Implementations for int and float; sdkWriteFile() is defined in helper_image.h
template<> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
};

template<> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
};

template<class T> bool test(int len)
{
    unsigned int mem_size = sizeof(T) * len;
    dim3 grid(1, 1, 1);
    dim3 threads(len, 1, 1);
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;
    cudaSetDevice(0);
    StartTimer();

    // Allocate memory
    T *h_idata, *h_odata, *d_idata, *d_odata;
    h_idata = (T *)malloc(mem_size);
    h_odata = (T *)malloc(mem_size);
    cudaMalloc((void **)&d_idata, mem_size);
    cudaMalloc((void **)&d_odata, mem_size);
    for (unsigned int i = 0; i < len; ++i)
        h_idata[i] = (T) i;
    cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);

    // Compute and time
    testKernel<T> <<< grid, threads, mem_size >>> (d_idata, d_odata);
    cudaMemcpy(h_odata, d_odata, sizeof(T) * len, cudaMemcpyDeviceToHost);
    printf("\n\tProcessing time: %f ms\n", GetTimer());

    // Check the results: compute the reference data (in place, overwriting h_idata)
    computeGold<T>(h_idata, h_idata, len);
    bool result = comparator.compare(h_idata, h_odata, len);
    //writer.write("./data/regression.dat", h_odata, len, 0.0f); // optional: write results to a file
    free(h_idata);
    free(h_odata);
    cudaFree(d_idata);
    cudaFree(d_odata);
    return result;
}

int main()
{
    printf("\n\tStart.\n");
    printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
    printf("\n\t> test<float, 64>, result: %s.\n", test<float>(64) ? "Passed" : "Failed");
    getchar();
    return 0;
}
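Note that none of the runtime calls above are error-checked. Since helper_cuda.h is already included, its checkCudaErrors macro can wrap them; a sketch of what the allocation and copy lines would look like with checking added:

// Sketch: wrapping the runtime API calls with checkCudaErrors from
// helper_cuda.h, which prints the failing file/line and exits on error.
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));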

▶ Output:

    Start.

    Processing time: 107.394216 ms

    > test<float, 32>, result: Passed.

    Processing time: 3.153182 ms

    > test<float, 64>, result: Passed.

The first call is far slower than the second, presumably because it absorbs one-time CUDA context and device initialization.

▶ Source code: runtime compilation (NVRTC)

 // sharedmem.cuh, identical to the static version
 // simpleTemplates_kernel.cu
#include "sharedmem.cuh"

template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    // Together, the two lines above are equivalent to: extern __shared__ T sdata[];
    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();
    sdata[tid] = (T)blockDim.x * sdata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}

extern "C" __global__ void testFloat(float *p1, float *p2) { testKernel<float>(p1, p2); }
extern "C" __global__ void testInt(int *p1, int *p2) { testKernel<int>(p1, p2); }
 // simpleTemplates.cpp
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <timer.h>

// Compute the reference (gold) results on the host
template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // cast (const unsigned int -> T) and add the const qualifier
    for (unsigned int i = 0; i < len; ++i)
        reference[i] = idata[i] * T_len;
}

// Wrapper for ArrayComparator
template<class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// Implementations for int and float; compareData() is defined in helper_image.h
template<> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
};

template<> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
};

// Wrapper for ArrayFileWriter
template<class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// Implementations for int and float; sdkWriteFile() is defined in helper_image.h
template<> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
};

template<> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
};

// Template for getKernel
template <typename T> CUfunction getKernel(CUmodule in);

template<> CUfunction getKernel<int>(CUmodule in)
{
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, in, "testInt");
    return kernel_addr;
}

template<> CUfunction getKernel<float>(CUmodule in)
{
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, in, "testFloat");
    return kernel_addr;
}

template<class T> bool test(int len)
{
    // Unlike the static version, compile the kernel file to PTX at run time
    char *kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu";
    char *ptx;
    size_t ptxSize;
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0); // 1 and NULL stand in for argc and argv
    CUmodule module = loadPTX(ptx, 1, NULL); // 1 and NULL stand in for argc and argv; also prints GPU information

    unsigned int mem_size = sizeof(T) * len;
    dim3 grid(1, 1, 1);
    dim3 threads(len, 1, 1);
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;
    StartTimer();

    // Allocate memory
    T *h_idata, *h_odata;
    CUdeviceptr d_idata, d_odata; // unlike the static version: driver API device pointers
    h_idata = (T *)malloc(mem_size);
    h_odata = (T *)malloc(mem_size);
    cuMemAlloc(&d_idata, mem_size); // unlike the static version
    cuMemAlloc(&d_odata, mem_size);
    for (unsigned int i = 0; i < len; ++i)
        h_idata[i] = (T)i;
    cuMemcpyHtoD(d_idata, h_idata, mem_size); // unlike the static version

    // Compute and time
    CUfunction kernel_addr = getKernel<T>(module);
    void *arr[] = { (void *)&d_idata, (void *)&d_odata };
    cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, 0, &arr[0], 0);
    cuCtxSynchronize(); // synchronize the context
    cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len); // unlike the static version
    printf("\n\tProcessing time: %f ms\n", GetTimer());

    // Check the results: compute the reference data (in place, overwriting h_idata)
    computeGold<T>(h_idata, h_idata, len);
    bool result = comparator.compare(h_idata, h_odata, len);
    //writer.write("./data/regression.dat", h_odata, len, 0.0f); // optional: write results to a file
    free(h_idata);
    free(h_odata);
    cuMemFree(d_idata); // unlike the static version
    cuMemFree(d_odata);
    return result;
}

int main()
{
    printf("\n\tStart.\n");
    printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
    printf("\n\t> test<int, 64>, result: %s.\n", test<int>(64) ? "Passed" : "Failed");
    getchar();
    return 0;
}
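The driver API calls above all return a CUresult that the sample ignores. A minimal sketch of checking the launch (cuGetErrorString is part of the driver API):

// Sketch: checking the CUresult of the launch instead of discarding it.
CUresult err = cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z,
                              threads.x, threads.y, threads.z,
                              mem_size, 0, &arr[0], 0);
if (err != CUDA_SUCCESS)
{
    const char *msg = NULL;
    cuGetErrorString(err, &msg);
    fprintf(stderr, "cuLaunchKernel failed: %s\n", msg ? msg : "unknown error");
}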

▶ Output:

    Start.

    > Using CUDA Device []: GeForce GTX
    > GPU Device has SM 6.1 compute capability

    Processing time: 0.699976 ms

    > test<float, 32>, result: Passed.

    > Using CUDA Device []: GeForce GTX
    > GPU Device has SM 6.1 compute capability

    Processing time: 0.665355 ms

    > test<int, 64>, result: Passed.

The device banner appears twice because each call to test() recompiles the kernel file and reloads the PTX.

▶ Takeaways

● The sample wraps three templates (SharedMemory, ArrayComparator, and ArrayFileWriter) and gives each explicit specializations for the supported data types; the unspecialized primary templates only report an error.
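The idiom shared by all three wrappers is worth isolating: a primary template whose only job is to report failure, plus explicit specializations that do the real work for each supported type. A minimal, self-contained host-side sketch of the same pattern (Printer is an illustrative name, not from the sample):

#include <stdio.h>

// Primary template: reports an error for unsupported types.
template <typename T> struct Printer
{
    static void print(T) { fprintf(stderr, "Error: no printer implemented for this type\n"); }
};

// Explicit specializations supply the per-type behavior.
template <> struct Printer<int>
{
    static void print(int v) { printf("%d\n", v); }
};

template <> struct Printer<float>
{
    static void print(float v) { printf("%f\n", v); }
};

int main()
{
    Printer<int>::print(42);     // selects the int specialization
    Printer<float>::print(3.5f); // selects the float specialization
    return 0;
}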
