0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc
使用 C++ 的模板
▶ 源代码:静态使用
- // sharedmem.cuh
- #ifndef _SHAREDMEM_H_
- #define _SHAREDMEM_H_
- // SharedMemory 的封装
- // Generic SharedMemory wrapper, used for element types that have no specialization.
- // getPointer() calls error(), which is only declared here; no definition is visible in this
- // listing (presumably so unsupported types fail at build time -- TODO confirm).
- template <typename T> struct SharedMemory
- {
-     __device__ T *getPointer()
-     {
-         extern __device__ void error(void);
-         error();
-         return static_cast<T *>(NULL);
-     }
- };
- // SharedMemory 的各种数据类型的实现
- // SharedMemory<int>: exposes the unsized extern __shared__ array as int*.
- template <> struct SharedMemory <int>
- {
-     __device__ int *getPointer()
-     {
-         extern __shared__ int s_int[];
-         return &s_int[0];
-     }
- };
- // SharedMemory<unsigned int>: exposes the unsized extern __shared__ array as unsigned int*.
- template <> struct SharedMemory <unsigned int>
- {
-     __device__ unsigned int *getPointer()
-     {
-         extern __shared__ unsigned int s_uint[];
-         return &s_uint[0];
-     }
- };
- // SharedMemory<char>: exposes the unsized extern __shared__ array as char*.
- template <> struct SharedMemory <char>
- {
-     __device__ char *getPointer()
-     {
-         extern __shared__ char s_char[];
-         return &s_char[0];
-     }
- };
- // SharedMemory<unsigned char>: exposes the unsized extern __shared__ array as unsigned char*.
- template <> struct SharedMemory <unsigned char>
- {
-     __device__ unsigned char *getPointer()
-     {
-         extern __shared__ unsigned char s_uchar[];
-         return &s_uchar[0];
-     }
- };
- // SharedMemory<short>: exposes the unsized extern __shared__ array as short*.
- template <> struct SharedMemory <short>
- {
-     __device__ short *getPointer()
-     {
-         extern __shared__ short s_short[];
-         return &s_short[0];
-     }
- };
- // SharedMemory<unsigned short>: exposes the unsized extern __shared__ array as unsigned short*.
- template <> struct SharedMemory <unsigned short>
- {
-     __device__ unsigned short *getPointer()
-     {
-         extern __shared__ unsigned short s_ushort[];
-         return &s_ushort[0];
-     }
- };
- // SharedMemory<long>: exposes the unsized extern __shared__ array as long*.
- template <> struct SharedMemory <long>
- {
-     __device__ long *getPointer()
-     {
-         extern __shared__ long s_long[];
-         return &s_long[0];
-     }
- };
- // SharedMemory<unsigned long>: exposes the unsized extern __shared__ array as unsigned long*.
- template <> struct SharedMemory <unsigned long>
- {
-     __device__ unsigned long *getPointer()
-     {
-         extern __shared__ unsigned long s_ulong[];
-         return &s_ulong[0];
-     }
- };
- // SharedMemory<bool>: exposes the unsized extern __shared__ array as bool*.
- template <> struct SharedMemory <bool>
- {
-     __device__ bool *getPointer()
-     {
-         extern __shared__ bool s_bool[];
-         return &s_bool[0];
-     }
- };
- // SharedMemory<float>: exposes the unsized extern __shared__ array as float*.
- template <> struct SharedMemory <float>
- {
-     __device__ float *getPointer()
-     {
-         extern __shared__ float s_float[];
-         return &s_float[0];
-     }
- };
- // SharedMemory<double>: exposes the unsized extern __shared__ array as double*.
- template <> struct SharedMemory <double>
- {
-     __device__ double *getPointer()
-     {
-         extern __shared__ double s_double[];
-         return &s_double[0];
-     }
- };
- #endif
- // simpleTemplates.cu
- #include <stdio.h>
- #include <timer.h>
- #include <cuda_runtime.h>
- #include "device_launch_parameters.h"
- #include <helper_functions.h>
- #include <helper_cuda.h>
- #include "sharedmem.cuh"
- // Kernel: each thread stages g_idata[threadIdx.x] in shared memory, multiplies it by
- // blockDim.x, and writes the product to g_odata[threadIdx.x].
- // Indexing uses threadIdx.x only and has no bounds check, so the caller must supply at least
- // blockDim.x elements in both arrays and blockDim.x * sizeof(T) bytes of dynamic shared memory.
- template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
- {
-     // Equivalent to `extern __shared__ T sdata[];`, routed through the SharedMemory wrapper.
-     T *const sdata = SharedMemory<T>().getPointer();
-     const unsigned int tid = threadIdx.x;
-     sdata[tid] = g_idata[tid];
-     __syncthreads();
-     const T staged = sdata[tid];
-     sdata[tid] = (T) blockDim.x * staged;
-     __syncthreads();
-     g_odata[tid] = sdata[tid];
- }
- // Host reference result: reference[i] = idata[i] * len for every i in [0, len).
- // len is converted to T once before the loop. Each element is read before its slot is
- // written, so reference and idata may point to the same array.
- template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
- {
-     const T T_len = static_cast<T>(len);
-     // The loop must start at index 0; the listing had lost the initializer value.
-     for (unsigned int i = 0; i < len; ++i)
-         reference[i] = idata[i] * T_len;
- }
- // ArrayComparator 的封装
- // Generic ArrayComparator: used for element types without a specialization.
- template<class T> class ArrayComparator
- {
- public:
-     // Writes an error message to stderr and returns false; the arguments are not inspected.
-     bool compare(const T *reference, T *data, unsigned int len)
-     {
-         fputs("Error: no comparison function implemented for this type\n", stderr);
-         return false;
-     }
- };
- // int 和 float 的实现,其中的函数 compareData() 定义于 helper_image.h
- // ArrayComparator<int>: forwards to compareData().
- template<> class ArrayComparator<int>
- {
- public:
-     // Returns the result of compareData(reference, data, len, 0.15f, 0.0f).
-     bool compare(const int *reference, int *data, unsigned int len)
-     {
-         const bool matches = compareData(reference, data, len, 0.15f, 0.0f);
-         return matches;
-     }
- };
- // ArrayComparator<float>: forwards to compareData().
- template<> class ArrayComparator<float>
- {
- public:
-     // Returns the result of compareData(reference, data, len, 0.15f, 0.15f).
-     bool compare(const float *reference, float *data, unsigned int len)
-     {
-         const bool matches = compareData(reference, data, len, 0.15f, 0.15f);
-         return matches;
-     }
- };
- // ArrayFileWriter 的封装
- // Generic ArrayFileWriter: used for element types without a specialization.
- template<class T> class ArrayFileWriter
- {
- public:
-     // Writes an error message to stderr and returns false; nothing is written to filename.
-     bool write(const char *filename, T *data, unsigned int len, float epsilon)
-     {
-         fputs("Error: no file write function implemented for this type\n", stderr);
-         return false;
-     }
- };
- // int 和 float 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h
- // ArrayFileWriter<int>: forwards to sdkWriteFile().
- template<> class ArrayFileWriter<int>
- {
- public:
-     // Returns the result of sdkWriteFile(filename, data, len, epsilon, false).
-     bool write(const char *filename, int *data, unsigned int len, float epsilon)
-     {
-         const bool written = sdkWriteFile(filename, data, len, epsilon, false);
-         return written;
-     }
- };
- // ArrayFileWriter<float>: forwards to sdkWriteFile().
- template<> class ArrayFileWriter<float>
- {
- public:
-     // Returns the result of sdkWriteFile(filename, data, len, epsilon, false).
-     bool write(const char *filename, float *data, unsigned int len, float epsilon)
-     {
-         const bool written = sdkWriteFile(filename, data, len, epsilon, false);
-         return written;
-     }
- };
- // Runs testKernel<T> on len elements and checks the device output against computeGold().
- // Launch layout: one block of len threads with len * sizeof(T) bytes of dynamic shared memory.
- // Returns the verdict of ArrayComparator<T>; returns false for a non-positive len or a failed
- // host allocation. CUDA API failures are reported through checkCudaErrors().
- template<class T> bool test(int len)
- {
-     if (len <= 0)
-     {
-         fprintf(stderr, "Error: test() needs a positive element count\n");
-         return false;
-     }
-     const unsigned int count = static_cast<unsigned int>(len);
-     const unsigned int mem_size = sizeof(T) * count;
-     dim3 grid(1, 1, 1);
-     dim3 threads(count, 1, 1);
-     ArrayComparator<T> comparator;
-     ArrayFileWriter<T> writer; // only used by the disabled regression-file write below
-     checkCudaErrors(cudaSetDevice(0));
-     StartTimer();
-     // Allocate host and device buffers
-     T *h_idata = (T *)malloc(mem_size);
-     T *h_odata = (T *)malloc(mem_size);
-     if (h_idata == NULL || h_odata == NULL)
-     {
-         fprintf(stderr, "Error: host allocation of %u bytes failed\n", mem_size);
-         free(h_idata);
-         free(h_odata);
-         return false;
-     }
-     T *d_idata = NULL, *d_odata = NULL;
-     checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
-     checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
-     for (unsigned int i = 0; i < count; ++i)
-         h_idata[i] = (T) i;
-     checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
-     // Launch and time
-     testKernel<T> << < grid, threads, mem_size >> > (d_idata, d_odata);
-     checkCudaErrors(cudaGetLastError()); // a launch-configuration error is only visible here
-     checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
-     printf("\n\tProcessing time: %f ms\n", GetTimer());
-     // Verify: the reference is computed in place over the input buffer
-     computeGold<T>(h_idata, h_idata, count);
-     bool result = comparator.compare(h_idata, h_odata, count);
-     //writer.write("./data/regression.dat", h_odata, count, 0.0f);// disabled file output
-     free(h_idata);
-     free(h_odata);
-     checkCudaErrors(cudaFree(d_idata));
-     checkCudaErrors(cudaFree(d_odata));
-     return result;
- }
- // Entry point: runs the float test with 32 and then 64 elements, matching the labels in the
- // printed messages, then waits for a key press before exiting.
- int main()
- {
-     printf("\n\tStart.\n");
-     printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
-     printf("\n\t> test<float, 64>, result: %s.\n", test<float>(64) ? "Passed" : "Failed");
-     getchar();
-     return 0;
- }
▶ 输出结果:
- Start.
- Processing time: 107.394216 ms
- > test<float, 32>, result: Passed.
- Processing time: 3.153182 ms
- > test<float, 64>, result: Passed.
▶ 源代码:使用运行时编译
- // sharedmem.cuh,与静态完全相同
- // simpleTemplates_kernel.cu
- #include "sharedmem.cuh"
- // Device routine shared by the extern "C" entry points testFloat and testInt.
- // It is __device__, not __global__: those entry points call it as an ordinary function, and a
- // __global__ function can only be invoked with a launch configuration.
- // Each thread stages g_idata[threadIdx.x] in shared memory, multiplies it by blockDim.x, and
- // writes the product to g_odata[threadIdx.x]. There is no bounds check, so the launch must
- // supply blockDim.x elements per array and blockDim.x * sizeof(T) bytes of dynamic shared memory.
- template<class T> __device__ void testKernel(T *g_idata, T *g_odata)
- {
-     SharedMemory<T> smem;
-     T *sdata = smem.getPointer();
-     // The two lines above are equivalent to `extern __shared__ T sdata[];`
-     const unsigned int tid = threadIdx.x;
-     sdata[tid] = g_idata[tid];
-     __syncthreads();
-     sdata[tid] = (T)blockDim.x * sdata[tid];
-     __syncthreads();
-     g_odata[tid] = sdata[tid];
- }
- // Kernel entry point with C linkage (unmangled name "testFloat"); forwards to testKernel<float>.
- extern "C" __global__ void testFloat(float *p1, float *p2)
- {
-     testKernel<float>(p1, p2);
- }
- // Kernel entry point with C linkage (unmangled name "testInt"); forwards to testKernel<int>.
- extern "C" __global__ void testInt(int *p1, int *p2)
- {
-     testKernel<int>(p1, p2);
- }
- // simpleTemplates.cpp
- #include <stdio.h>
- #include <cuda_runtime.h>
- #include "device_launch_parameters.h"
- #include <helper_functions.h>
- #include <nvrtc_helper.h>
- #include <timer.h>
- // Host reference result: reference[i] = idata[i] * len for every i in [0, len).
- // len is converted to T once before the loop. Each element is read before its slot is
- // written, so reference and idata may point to the same array.
- template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
- {
-     const T T_len = static_cast<T>(len);
-     // The loop must start at index 0; the listing had lost the initializer value.
-     for (unsigned int i = 0; i < len; ++i)
-         reference[i] = idata[i] * T_len;
- }
- // ArrayComparator 的封装
- // Generic ArrayComparator: used for element types without a specialization.
- template<class T> class ArrayComparator
- {
- public:
-     // Writes an error message to stderr and returns false; the arguments are not inspected.
-     bool compare(const T *reference, T *data, unsigned int len)
-     {
-         fputs("Error: no comparison function implemented for this type\n", stderr);
-         return false;
-     }
- };
- // int 和 float 的实现,其中的函数 compareData() 定义于 helper_image.h
- // ArrayComparator<int>: forwards to compareData().
- template<> class ArrayComparator<int>
- {
- public:
-     // Returns the result of compareData(reference, data, len, 0.15f, 0.0f).
-     bool compare(const int *reference, int *data, unsigned int len)
-     {
-         const bool matches = compareData(reference, data, len, 0.15f, 0.0f);
-         return matches;
-     }
- };
- // ArrayComparator<float>: forwards to compareData().
- template<> class ArrayComparator<float>
- {
- public:
-     // Returns the result of compareData(reference, data, len, 0.15f, 0.15f).
-     bool compare(const float *reference, float *data, unsigned int len)
-     {
-         const bool matches = compareData(reference, data, len, 0.15f, 0.15f);
-         return matches;
-     }
- };
- // ArrayFileWriter 的封装
- // Generic ArrayFileWriter: used for element types without a specialization.
- template<class T> class ArrayFileWriter
- {
- public:
-     // Writes an error message to stderr and returns false; nothing is written to filename.
-     bool write(const char *filename, T *data, unsigned int len, float epsilon)
-     {
-         fputs("Error: no file write function implemented for this type\n", stderr);
-         return false;
-     }
- };
- // int 和 float 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h
- // ArrayFileWriter<int>: forwards to sdkWriteFile().
- template<> class ArrayFileWriter<int>
- {
- public:
-     // Returns the result of sdkWriteFile(filename, data, len, epsilon, false).
-     bool write(const char *filename, int *data, unsigned int len, float epsilon)
-     {
-         const bool written = sdkWriteFile(filename, data, len, epsilon, false);
-         return written;
-     }
- };
- // ArrayFileWriter<float>: forwards to sdkWriteFile().
- template<> class ArrayFileWriter<float>
- {
- public:
-     // Returns the result of sdkWriteFile(filename, data, len, epsilon, false).
-     bool write(const char *filename, float *data, unsigned int len, float epsilon)
-     {
-         const bool written = sdkWriteFile(filename, data, len, epsilon, false);
-         return written;
-     }
- };
- // getKernel 的模板
- // Resolves the kernel entry point for element type T in a loaded module.
- // Only the explicit specializations are defined.
- template <typename T> CUfunction getKernel(CUmodule in);
- // int: looks up "testInt". On failure, logs the CUresult and returns NULL instead of an
- // uninitialized handle.
- template<> CUfunction getKernel<int>(CUmodule in)
- {
-     CUfunction kernel_addr = NULL;
-     const CUresult status = cuModuleGetFunction(&kernel_addr, in, "testInt");
-     if (status != CUDA_SUCCESS)
-     {
-         fprintf(stderr, "Error: cuModuleGetFunction(\"testInt\") failed with CUresult %d\n", static_cast<int>(status));
-         return NULL;
-     }
-     return kernel_addr;
- }
- // float: looks up "testFloat". On failure, logs the CUresult and returns NULL instead of an
- // uninitialized handle.
- template<> CUfunction getKernel<float>(CUmodule in)
- {
-     CUfunction kernel_addr = NULL;
-     const CUresult status = cuModuleGetFunction(&kernel_addr, in, "testFloat");
-     if (status != CUDA_SUCCESS)
-     {
-         fprintf(stderr, "Error: cuModuleGetFunction(\"testFloat\") failed with CUresult %d\n", static_cast<int>(status));
-         return NULL;
-     }
-     return kernel_addr;
- }
- // Logs a failed driver-API call to stderr; returns true only when status is CUDA_SUCCESS.
- static bool driverCallSucceeded(CUresult status, const char *what)
- {
-     if (status == CUDA_SUCCESS)
-         return true;
-     fprintf(stderr, "Error: %s failed with CUresult %d\n", what, static_cast<int>(status));
-     return false;
- }
- // Compiles the kernel file to PTX at run time, launches the entry point for T on len elements
- // (one block of len threads, len * sizeof(T) bytes of dynamic shared memory) and checks the
- // device output against computeGold().
- // Returns the verdict of ArrayComparator<T>; returns false for a non-positive len, a failed
- // host allocation, or any failed driver-API call.
- template<class T> bool test(int len)
- {
-     if (len <= 0)
-     {
-         fprintf(stderr, "Error: test() needs a positive element count\n");
-         return false;
-     }
-     // Unlike the static build: compile to PTX at run time.
-     // A writable array is used because a string literal may not bind to a non-const char *.
-     // NOTE(review): this is a machine-specific absolute path.
-     char kernel_file[] = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu";
-     char *ptx = NULL;
-     size_t ptxSize = 0;
-     compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0); // 1, NULL stand in for argc and argv
-     CUmodule module = loadPTX(ptx, 1, NULL);                   // 1, NULL stand in for argc and argv; prints GPU info
-     // NOTE(review): ptx and module are not released here; confirm who owns them.
-     const unsigned int count = static_cast<unsigned int>(len);
-     const unsigned int mem_size = sizeof(T) * count;
-     dim3 grid(1, 1, 1);
-     dim3 threads(count, 1, 1);
-     ArrayComparator<T> comparator;
-     ArrayFileWriter<T> writer; // only used by the disabled regression-file write below
-     StartTimer();
-     // Allocate host and device buffers (driver API, unlike the static build)
-     T *h_idata = (T *)malloc(mem_size);
-     T *h_odata = (T *)malloc(mem_size);
-     CUdeviceptr d_idata = 0, d_odata = 0;
-     bool ok = (h_idata != NULL) && (h_odata != NULL);
-     if (!ok)
-         fprintf(stderr, "Error: host allocation of %u bytes failed\n", mem_size);
-     ok = ok && driverCallSucceeded(cuMemAlloc(&d_idata, mem_size), "cuMemAlloc(d_idata)");
-     ok = ok && driverCallSucceeded(cuMemAlloc(&d_odata, mem_size), "cuMemAlloc(d_odata)");
-     if (ok)
-     {
-         for (unsigned int i = 0; i < count; ++i)
-             h_idata[i] = (T)i;
-         ok = driverCallSucceeded(cuMemcpyHtoD(d_idata, h_idata, mem_size), "cuMemcpyHtoD");
-     }
-     // Launch and time
-     if (ok)
-     {
-         CUfunction kernel_addr = getKernel<T>(module);
-         void *arr[] = { (void *)&d_idata, (void *)&d_odata };
-         ok = driverCallSucceeded(cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, 0, &arr[0], 0), "cuLaunchKernel")
-             && driverCallSucceeded(cuCtxSynchronize(), "cuCtxSynchronize") // wait for the kernel to finish
-             && driverCallSucceeded(cuMemcpyDtoH(h_odata, d_odata, mem_size), "cuMemcpyDtoH");
-     }
-     printf("\n\tProcessing time: %f ms\n", GetTimer());
-     // Verify: the reference is computed in place over the input buffer
-     bool result = false;
-     if (ok)
-     {
-         computeGold<T>(h_idata, h_idata, count);
-         result = comparator.compare(h_idata, h_odata, count);
-     }
-     //writer.write("./data/regression.dat", h_odata, len, 0.0f);// disabled file output
-     free(h_idata);
-     free(h_odata);
-     // Free only the device buffers that were actually allocated
-     if (d_idata != 0)
-         cuMemFree(d_idata);
-     if (d_odata != 0)
-         cuMemFree(d_odata);
-     return result;
- }
- // Entry point: runs the float test with 32 elements and the int test with 64 elements,
- // matching the labels in the printed messages, then waits for a key press before exiting.
- int main()
- {
-     printf("\n\tStart.\n");
-     printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
-     printf("\n\t> test<int, 64>, result: %s.\n", test<int>(64) ? "Passed" : "Failed");
-     getchar();
-     return 0;
- }
▶ 输出结果:
- Start.
- > Using CUDA Device []: GeForce GTX
- > GPU Device has SM 6.1 compute capability
- Processing time: 0.699976 ms
- > test<float, 32>, result: Passed.
- > Using CUDA Device []: GeForce GTX
- > GPU Device has SM 6.1 compute capability
- Processing time: 0.665355 ms
- > test<int, 64>, result: Passed.
▶ 涨姿势
● 封装了 SharedMemory,ArrayComparator,ArrayFileWriter 三个模板,并定义了其在不同的数据类型下的实现。
0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc的更多相关文章
随机推荐
- 第二百七十六节,MySQL数据库,【显示、创建、选定、删除数据库】,【用户管理、对用户增删改查以及授权】
MySQL数据库,[显示.创建.选定.删除数据库],[用户管理.对用户增删改查以及授权] 1.显示数据库 SHOW DATABASES;显示数据库 SHOW DATABASES; mysql - 用户 ...
- 有用的sql语句积累
⑴. sql查询未被外键关联的数据 select * from bb b where not exists (select 1 from aa a where a.bid=b.bid)
- 自动AC机
可以在lemon和cena环境下使用. #include<iostream> #include<cstdio> #include<cstring> #include ...
- Java JDK 版本的区别
jdk6和jdk5相比的新特性有: 1.instrumentation 在 Java SE 6 里面,instrumentation 包被赋予了更强大的功能:启动后的 instrument.本地代码 ...
- 重磅来袭,开源Asp.Net MVC网上商城BrnShop正式发布,提供源码下载(转)
BrnShop网上商城是以Asp.Net mvc3为基础开发的网上商城,源代码完全开源(企业版的源代码目前还没有完全整理完成,一旦整理完成也全部开源). 啥话也不说了,直接上源码:下载源码(由于公司服 ...
- day43 数据库知识欠缺的
一 什么是存储引擎 mysql中建立的库===>文件夹 库中建立的表===>文件 现实生活中我们用来存储数据的文件有不同的类型,每种文件类型对应各自不同的处理机制:比如处理文本用txt类型 ...
- ActiveMQ集群方案
集群方案主要为了解决系统架构中的两个关键问题:高可用和高性能.ActiveMQ服务的高可用性是指,在ActiveMQ服务性能不变.数据不丢失的前提下,确保当系统灾难出现时ActiveMQ能够持续提供消 ...
- [C#]画图全攻略(饼图与柱状图)(转)
http://blog.chinaunix.net/uid-15481846-id-2769484.html 首先建立一个c#的类库. 打开vs.net,建立一个名为Insight_cs.WebC ...
- bzoj 4025 二分图——线段树分治+LCT
题目:https://www.lydsy.com/JudgeOnline/problem.php?id=4025 线段树分治,用 LCT 维护链的长度即可.不过很慢. 正常(更快)的方法应该是线段树分 ...
- <<精通正在表达式>> 书评
IT产业新技术日新月异,令人目不暇给,然而在这其中,真正称得上伟大东西的却寥寥无几.1998年,被誉为“软件世界的爱迪生”,发明了BSD. TCP/IP.csh.vi和NFS的SUN首席科学家Bil ...