使用 C++ 的模板

▶ 源代码:静态使用

 // sharedmem.cuh
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_ // SharedMemory 的封装
template <typename T> struct SharedMemory
{
__device__ T *getPointer()
{
extern __device__ void error(void);
error();
return NULL;
}
}; // SharedMemory 的各种数据类型的实现
template <> struct SharedMemory <int>
{
__device__ int *getPointer()
{
extern __shared__ int s_int[];
return s_int;
}
}; template <> struct SharedMemory <unsigned int>
{
__device__ unsigned int *getPointer()
{
extern __shared__ unsigned int s_uint[];
return s_uint;
}
}; template <> struct SharedMemory <char>
{
__device__ char *getPointer()
{
extern __shared__ char s_char[];
return s_char;
}
}; template <> struct SharedMemory <unsigned char>
{
__device__ unsigned char *getPointer()
{
extern __shared__ unsigned char s_uchar[];
return s_uchar;
}
}; template <> struct SharedMemory <short>
{
__device__ short *getPointer()
{
extern __shared__ short s_short[];
return s_short;
}
}; template <> struct SharedMemory <unsigned short>
{
__device__ unsigned short *getPointer()
{
extern __shared__ unsigned short s_ushort[];
return s_ushort;
}
}; template <> struct SharedMemory <long>
{
__device__ long *getPointer()
{
extern __shared__ long s_long[];
return s_long;
}
}; template <> struct SharedMemory <unsigned long>
{
__device__ unsigned long *getPointer()
{
extern __shared__ unsigned long s_ulong[];
return s_ulong;
}
}; template <> struct SharedMemory <bool>
{
__device__ bool *getPointer()
{
extern __shared__ bool s_bool[];
return s_bool;
}
}; template <> struct SharedMemory <float>
{
__device__ float *getPointer()
{
extern __shared__ float s_float[];
return s_float;
}
}; template <> struct SharedMemory <double>
{
__device__ double *getPointer()
{
extern __shared__ double s_double[];
return s_double;
}
}; #endif
 // simpleTemplates.cu
#include <stdio.h>
#include <timer.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "sharedmem.cuh"

// Kernel: each thread stages one element through dynamic shared memory and
// writes back blockDim.x * value. Expects a launch of <<<1 block, len threads,
// len * sizeof(T) bytes of dynamic shared memory>>>.
template<class T>
__global__ void testKernel(T *g_idata, T *g_odata)
{
    // The two lines below are together equivalent to: extern __shared__ T sdata[];
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();

    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();

    sdata[tid] = (T) blockDim.x * sdata[tid];
    __syncthreads();

    g_odata[tid] = sdata[tid];
}

// CPU reference: reference[i] = idata[i] * len. Safe to call in place
// (reference == idata), as test() does.
template<class T>
void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // convert len once (unsigned int -> T)
    for (unsigned int i = 0; i < len; ++i) // BUGFIX: loop start literal was missing
        reference[i] = idata[i] * T_len;
}
// ArrayComparator wrapper follows.
// Generic comparator: falls through with an error for unsupported types;
// only the int and float specializations below do real work.
template<class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// int and float implementations; compareData() is defined in helper_image.h.
template<> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.0f);
    }
};

template<> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.15f);
    }
};

// Generic file writer: falls through with an error for unsupported types;
// only the int and float specializations below do real work.
template<class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// int and float implementations; sdkWriteFile() is defined in helper_image.h.
template<> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

template<> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

// Run testKernel<T> on len elements in a single block and check the result
// against the CPU reference. Returns true when the comparison passes.
// NOTE(review): assumes len <= max threads per block (1024) — no guard here.
template<class T>
bool test(int len)
{
    unsigned int mem_size = sizeof(T) * len;
    dim3 grid(1, 1, 1);       // one block
    dim3 threads(len, 1, 1);  // one thread per element
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;
    cudaSetDevice(0);
    StartTimer();

    // Allocate host and device buffers.
    T *h_idata, *h_odata, *d_idata, *d_odata;
    h_idata = (T *)malloc(mem_size);
    h_odata = (T *)malloc(mem_size);
    cudaMalloc((void **)&d_idata, mem_size);
    cudaMalloc((void **)&d_odata, mem_size);

    for (unsigned int i = 0; i < (unsigned int)len; ++i) // BUGFIX: loop start literal was missing
        h_idata[i] = (T) i;
    cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);

    // Launch and time; the third launch argument is the dynamic shared-memory size.
    testKernel<T> << < grid, threads, mem_size >> > (d_idata, d_odata);
    cudaMemcpy(h_odata, d_odata, sizeof(T) * len, cudaMemcpyDeviceToHost); // blocking copy synchronizes
    printf("\n\tProcessing time: %f ms\n", GetTimer());

    // Check the result (computeGold works in place on h_idata).
    computeGold<T>(h_idata, h_idata, len);
    bool result = comparator.compare(h_idata, h_odata, len);
    //writer.write("./data/regression.dat", h_odata, len, 0.0f); // optional file dump

    free(h_idata); // BUGFIX: this free was lost inside the comment above
    free(h_odata);
    cudaFree(d_idata);
    cudaFree(d_odata);
    return result;
}

int main()
{
    printf("\n\tStart.\n");
    printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
    printf("\n\t> test<float, 64>, result: %s.\n", test<float>(64) ? "Passed" : "Failed");
    getchar();
    return 0;
}

▶ 输出结果:

    Start.

    Processing time: 107.394216 ms

    > test<float, 32>, result: Passed.

    Processing time: 3.153182 ms

    > test<float, 64>, result: Passed.

▶ 源代码:使用运行时编译

 // sharedmem.cuh,与静态完全相同
 // simpleTemplates_kernel.cu
#include "sharedmem.cuh" template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
SharedMemory<T> smem;
T *sdata = smem.getPointer();
// 以上两行结合,等效于 extern __shared__ T sdata[];
const unsigned int tid = threadIdx.x; sdata[tid] = g_idata[tid];
__syncthreads();
sdata[tid] = (T)blockDim.x * sdata[tid];
__syncthreads();
g_odata[tid] = sdata[tid];
} extern "C" __global__ void testFloat(float *p1, float *p2) { testKernel<float>(p1, p2); } extern "C" __global__ void testInt(int *p1, int *p2) { testKernel<int>(p1, p2); }
 // simpleTemplates.cpp
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <timer.h>

// CPU reference: reference[i] = idata[i] * len. Safe to call in place
// (reference == idata), as test() does.
template<class T>
void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // convert len once (unsigned int -> T)
    for (unsigned int i = 0; i < len; ++i) // BUGFIX: loop start literal was missing
        reference[i] = idata[i] * T_len;
}
// ArrayComparator wrapper follows.
template<class T> class ArrayComparator
{
public:
bool compare(const T *reference, T *data, unsigned int len)
{
fprintf(stderr, "Error: no comparison function implemented for this type\n");
return false;
}
};
// int 和 flaot 的实现,其中的函数 compareData() 定义于 helper_image.h
template<> class ArrayComparator<int>
{
public:
bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
}; template<> class ArrayComparator<float>
{
public:
bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
}; // ArrayFileWriter 的封装
template<class T> class ArrayFileWriter
{
public:
bool write(const char *filename, T *data, unsigned int len, float epsilon)
{
fprintf(stderr, "Error: no file write function implemented for this type\n");
return false;
}
};
// int 和 flaot 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h
template<> class ArrayFileWriter<int>
{
public:
bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
}; template<> class ArrayFileWriter<float>
{
public:
bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
}; // getKernel 的模板
template <typename T> CUfunction getKernel(CUmodule in); template<> CUfunction getKernel<int>(CUmodule in)
{
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, in, "testInt");
return kernel_addr;
} template<> CUfunction getKernel<float>(CUmodule in)
{
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, in, "testFloat");
return kernel_addr;
} template<class T> bool test(int len)
{
// 与静态不同,编译 PTX
char *kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu";
char *ptx;
size_t ptxSize;
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, ); // 1, NULL 分别为 argc 和 argv
CUmodule module = loadPTX(ptx, , NULL); // 1, NULL 分别为 argc 和 argv,有关于 GPU的输出 unsigned int mem_size = sizeof(T) * len;
dim3 grid(, , );
dim3 threads(len, , );
ArrayComparator<T> comparator;
ArrayFileWriter<T> writer;
StartTimer(); // 申请内存
T *h_idata, *h_odata;
CUdeviceptr d_idata, d_odata; // 与静态不同
h_idata = (T *)malloc(mem_size);
h_odata = (T *)malloc(mem_size);
cuMemAlloc(&d_idata, mem_size); // 与静态不同
cuMemAlloc(&d_odata, mem_size);
for (unsigned int i = ; i < len; ++i)
h_idata[i] = (T)i;
cuMemcpyHtoD(d_idata, h_idata, mem_size); // 与静态不同 // 计算和计时
CUfunction kernel_addr = getKernel<T>(module); void *arr[] = { (void *)&d_idata, (void *)&d_odata };
cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, , &arr[], );
cuCtxSynchronize(); // 上下文同步
cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len);// 与静态不同
printf("\n\tProcessing time: %f ms\n", GetTimer()); // 检查结果
computeGold<T>(h_idata, h_idata, len);// 生成理论结果数据
bool result = comparator.compare(h_idata, h_odata, len);
//writer.write("./data/regression.dat", h_odata, len, 0.0f);// 写入文件的部分 free(h_idata);
free(h_odata);
cuMemFree(d_idata); // 与静态不同
cuMemFree(d_odata);
return result;
} int main()
{
printf("\n\tStart.\n");
printf("\n\t> test<float, 32>, result: %s.\n", test<float>() ? "Passed" : "Failed");
printf("\n\t> test<int, 64>, result: %s.\n", test<int>() ? "Passed" : "Failed"); getchar();
return ;
}

▶ 输出结果:

    Start.
> Using CUDA Device [0]: GeForce GTX
> GPU Device has SM 6.1 compute capability Processing time: 0.699976 ms > test<float, 32>, result: Passed.
> Using CUDA Device [0]: GeForce GTX
> GPU Device has SM 6.1 compute capability Processing time: 0.665355 ms > test<int, 64>, result: Passed.

▶ 涨姿势

● 封装了 SharedMemory,ArrayComparator,ArrayFileWriter 三个模板,并定义了其在不同的数据类型下的实现。

0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc的更多相关文章

随机推荐

  1. HDU 1234:开门人和关门人

    开门人和关门人 Time Limit: 2000/1000 MS (Java/Others)    Memory Limit: 65536/32768 K (Java/Others) Total Su ...

  2. freemarker在js中的应用

    <script type="text/javascript"> //freemarker在js中的应用: var newOrganizations = []; < ...

  3. Git Authoritative Guide 学习

    一.git命令1.git add -u : 将工作区中所有改动的文件添加到暂存区(修改.删除),但是不提交未被git跟踪的文件 -i : 可以进入交互界面选择性提交 -A : 相对于-u,它还提交新建 ...

  4. leetcode:Maximum Depth of Binary Tree【Python版】

    # Definition for a binary tree node # class TreeNode: # def __init__(self, x): # self.val = x # self ...

  5. adnanh webhook 框架request values 说明

      request values 在adnanh webhook 是比较重要的,规则触发以及命令参数传递都是通过它 支持的request values 类似 http header 查询参数 play ...

  6. C# 线程会合实例

    有这样一个题目:四个线程t1,t2,t3,t4,向4个文件中写入数据,要求:t1只能写入“1”,t2只能写入“2”,t3只能写入“3”,t4只能写入“4”,对4个文件A,B,C,D写入如下内容: A: ...

  7. 廖雪峰 ---- Python教程

    这是小白的Python新手教程,具有如下特点: 中文,免费,零起点,完整示例,基于最新的Python 3版本. Python是一种计算机程序设计语言.你可能已经听说过很多种流行的编程语言,比如非常难学 ...

  8. asp.net 退出登陆(解决退出后点击浏览器后退问题仍然可回到页面问题)

    代码如下: Session.Abandon(); Response.Redirect("Login.aspx"); 但是这样点点击浏览器的后退仍然可以回到刚才的页面,这可不行,在网 ...

  9. C#实现不安装Oracle客户端访问远程服务器数据

    概述: C#通过使用ADO的方式在未安装Oracle数据库的前提下,客户端程序远程访问服务器,会出现:“System.Data.OracleClient 需要 Oracle 客户端软件 8.1.7 或 ...

  10. Mysql 性能优化2 系统参数配置方法 和 文件系统

    --------------------------------------------目录------------------------------------------------- • 关于 ...