使用 C++ 的模板

▶ 源代码:静态使用

  1. // sharedmem.cuh
  2. #ifndef _SHAREDMEM_H_
  3. #define _SHAREDMEM_H_
  4.  
  5. // SharedMemory 的封装
  6. template <typename T> struct SharedMemory
  7. {
  8. __device__ T *getPointer()
  9. {
  10. extern __device__ void error(void);
  11. error();
  12. return NULL;
  13. }
  14. };
  15.  
  16. // SharedMemory 的各种数据类型的实现
  17. template <> struct SharedMemory <int>
  18. {
  19. __device__ int *getPointer()
  20. {
  21. extern __shared__ int s_int[];
  22. return s_int;
  23. }
  24. };
  25.  
  26. template <> struct SharedMemory <unsigned int>
  27. {
  28. __device__ unsigned int *getPointer()
  29. {
  30. extern __shared__ unsigned int s_uint[];
  31. return s_uint;
  32. }
  33. };
  34.  
  35. template <> struct SharedMemory <char>
  36. {
  37. __device__ char *getPointer()
  38. {
  39. extern __shared__ char s_char[];
  40. return s_char;
  41. }
  42. };
  43.  
  44. template <> struct SharedMemory <unsigned char>
  45. {
  46. __device__ unsigned char *getPointer()
  47. {
  48. extern __shared__ unsigned char s_uchar[];
  49. return s_uchar;
  50. }
  51. };
  52.  
  53. template <> struct SharedMemory <short>
  54. {
  55. __device__ short *getPointer()
  56. {
  57. extern __shared__ short s_short[];
  58. return s_short;
  59. }
  60. };
  61.  
  62. template <> struct SharedMemory <unsigned short>
  63. {
  64. __device__ unsigned short *getPointer()
  65. {
  66. extern __shared__ unsigned short s_ushort[];
  67. return s_ushort;
  68. }
  69. };
  70.  
  71. template <> struct SharedMemory <long>
  72. {
  73. __device__ long *getPointer()
  74. {
  75. extern __shared__ long s_long[];
  76. return s_long;
  77. }
  78. };
  79.  
  80. template <> struct SharedMemory <unsigned long>
  81. {
  82. __device__ unsigned long *getPointer()
  83. {
  84. extern __shared__ unsigned long s_ulong[];
  85. return s_ulong;
  86. }
  87. };
  88.  
  89. template <> struct SharedMemory <bool>
  90. {
  91. __device__ bool *getPointer()
  92. {
  93. extern __shared__ bool s_bool[];
  94. return s_bool;
  95. }
  96. };
  97.  
  98. template <> struct SharedMemory <float>
  99. {
  100. __device__ float *getPointer()
  101. {
  102. extern __shared__ float s_float[];
  103. return s_float;
  104. }
  105. };
  106.  
  107. template <> struct SharedMemory <double>
  108. {
  109. __device__ double *getPointer()
  110. {
  111. extern __shared__ double s_double[];
  112. return s_double;
  113. }
  114. };
  115.  
  116. #endif
  1. // simpleTemplates.cu
  2. #include <stdio.h>
  3. #include <timer.h>
  4. #include <cuda_runtime.h>
  5. #include "device_launch_parameters.h"
  6. #include <helper_functions.h>
  7. #include <helper_cuda.h>
  8. #include "sharedmem.cuh"
  9.  
  10. template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
  11. {
  12. SharedMemory<T> smem;
  13. T *sdata = smem.getPointer();
  14. // 以上两行结合,等效于 extern __shared__ T sdata[];
  15. const unsigned int tid = threadIdx.x;
  16.  
  17. sdata[tid] = g_idata[tid];
  18. __syncthreads();
  19. sdata[tid] = (T) blockDim.x * sdata[tid];
  20. __syncthreads();
  21. g_odata[tid] = sdata[tid];
  22. }
  23.  
  24. template<class T> void computeGold(T *reference, T *idata, const unsigned int len)// 生成理论结果数据
  25. {
  26. const T T_len = static_cast<T>(len);// 强制类型转换(const unsigned int -> T),并加上 const 限定
  27. for (unsigned int i = 0; i < len; ++i)
  28. reference[i] = idata[i] * T_len;
  29. }
  30.  
  31. // ArrayComparator 的封装
  32. template<class T> class ArrayComparator
  33. {
  34. public:
  35. bool compare(const T *reference, T *data, unsigned int len)
  36. {
  37. fprintf(stderr, "Error: no comparison function implemented for this type\n");
  38. return false;
  39. }
  40. };
  41. // int 和 float 的实现,其中的函数 compareData() 定义于 helper_image.h
  42. template<> class ArrayComparator<int>
  43. {
  44. public:
  45. bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
  46. };
  47.  
  48. template<> class ArrayComparator<float>
  49. {
  50. public:
  51. bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
  52. };
  53.  
  54. // ArrayFileWriter 的封装
  55. template<class T> class ArrayFileWriter
  56. {
  57. public:
  58. bool write(const char *filename, T *data, unsigned int len, float epsilon)
  59. {
  60. fprintf(stderr, "Error: no file write function implemented for this type\n");
  61. return false;
  62. }
  63. };
  64. // int 和 float 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h
  65. template<> class ArrayFileWriter<int>
  66. {
  67. public:
  68. bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
  69. };
  70.  
  71. template<> class ArrayFileWriter<float>
  72. {
  73. public:
  74. bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
  75. };
  76.  
  77. template<class T> bool test(int len)
  78. {
  79. unsigned int mem_size = sizeof(T) * len;
  80. dim3 grid(1, 1, 1);
  81. dim3 threads(len, 1, 1);
  82. ArrayComparator<T> comparator;
  83. ArrayFileWriter<T> writer;
  84. cudaSetDevice(0);
  85. StartTimer();
  86.  
  87. // 申请内存
  88. T *h_idata, *h_odata, *d_idata, *d_odata;
  89. h_idata = (T *)malloc(mem_size);
  90. h_odata = (T *)malloc(mem_size);
  91. cudaMalloc((void **)&d_idata, mem_size);
  92. cudaMalloc((void **)&d_odata, mem_size);
  93. for (unsigned int i = 0; i < len; ++i)
  94. h_idata[i] = (T) i;
  95. cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);
  96.  
  97. // 计算和计时
  98. testKernel<T> << < grid, threads, mem_size >> > (d_idata, d_odata);
  99. cudaMemcpy(h_odata, d_odata, sizeof(T) * len, cudaMemcpyDeviceToHost);
  100. printf("\n\tProcessing time: %f ms\n", GetTimer());
  101.  
  102. // 检查结果
  103. computeGold<T>(h_idata, h_idata, len);// 生成理论结果数据
  104. bool result = comparator.compare(h_idata, h_odata, len);
  105. //writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);// 写入文件的部分
  106.  
  107. free(h_idata);
  108. free(h_odata);
  109. cudaFree(d_idata);
  110. cudaFree(d_odata);
  111. return result;
  112. }
  113.  
  114. int main()
  115. {
  116. printf("\n\tStart.\n");
  117. printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
  118. printf("\n\t> test<float, 64>, result: %s.\n", test<float>(64) ? "Passed" : "Failed");
  119.  
  120. getchar();
  121. return 0;
  122. }

▶ 输出结果:

  1. Start.
  2.  
  3. Processing time: 107.394216 ms
  4.  
  5. > test<float, 32>, result: Passed.
  6.  
  7. Processing time: 3.153182 ms
  8.  
  9. > test<float, 64>, result: Passed.

▶ 源代码:使用运行时编译

  1. // sharedmem.cuh,与静态完全相同
  1. // simpleTemplates_kernel.cu
  2. #include "sharedmem.cuh"
  3.  
  4. template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
  5. {
  6. SharedMemory<T> smem;
  7. T *sdata = smem.getPointer();
  8. // 以上两行结合,等效于 extern __shared__ T sdata[];
  9. const unsigned int tid = threadIdx.x;
  10.  
  11. sdata[tid] = g_idata[tid];
  12. __syncthreads();
  13. sdata[tid] = (T)blockDim.x * sdata[tid];
  14. __syncthreads();
  15. g_odata[tid] = sdata[tid];
  16. }
  17.  
  18. extern "C" __global__ void testFloat(float *p1, float *p2) { testKernel<float>(p1, p2); }
  19.  
  20. extern "C" __global__ void testInt(int *p1, int *p2) { testKernel<int>(p1, p2); }
  1. // simpleTemplates.cpp
  2. #include <stdio.h>
  3. #include <cuda_runtime.h>
  4. #include "device_launch_parameters.h"
  5. #include <helper_functions.h>
  6. #include <nvrtc_helper.h>
  7. #include <timer.h>
  8.  
  9. template<class T> void computeGold(T *reference, T *idata, const unsigned int len)// 生成理论结果数据
  10. {
  11. const T T_len = static_cast<T>(len);// 强制类型转换(const unsigned int -> T),并加上 const 限定
  12. for (unsigned int i = 0; i < len; ++i)
  13. reference[i] = idata[i] * T_len;
  14. }
  15.  
  16. // ArrayComparator 的封装
  17. template<class T> class ArrayComparator
  18. {
  19. public:
  20. bool compare(const T *reference, T *data, unsigned int len)
  21. {
  22. fprintf(stderr, "Error: no comparison function implemented for this type\n");
  23. return false;
  24. }
  25. };
  26. // int 和 float 的实现,其中的函数 compareData() 定义于 helper_image.h
  27. template<> class ArrayComparator<int>
  28. {
  29. public:
  30. bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
  31. };
  32.  
  33. template<> class ArrayComparator<float>
  34. {
  35. public:
  36. bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
  37. };
  38.  
  39. // ArrayFileWriter 的封装
  40. template<class T> class ArrayFileWriter
  41. {
  42. public:
  43. bool write(const char *filename, T *data, unsigned int len, float epsilon)
  44. {
  45. fprintf(stderr, "Error: no file write function implemented for this type\n");
  46. return false;
  47. }
  48. };
  49. // int 和 float 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h
  50. template<> class ArrayFileWriter<int>
  51. {
  52. public:
  53. bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
  54. };
  55.  
  56. template<> class ArrayFileWriter<float>
  57. {
  58. public:
  59. bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
  60. };
  61.  
  62. // getKernel 的模板
  63. template <typename T> CUfunction getKernel(CUmodule in);
  64.  
  65. template<> CUfunction getKernel<int>(CUmodule in)
  66. {
  67. CUfunction kernel_addr;
  68. cuModuleGetFunction(&kernel_addr, in, "testInt");
  69. return kernel_addr;
  70. }
  71.  
  72. template<> CUfunction getKernel<float>(CUmodule in)
  73. {
  74. CUfunction kernel_addr;
  75. cuModuleGetFunction(&kernel_addr, in, "testFloat");
  76. return kernel_addr;
  77. }
  78.  
  79. template<class T> bool test(int len)
  80. {
  81. // 与静态不同,编译 PTX
  82. char *kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu";
  83. char *ptx;
  84. size_t ptxSize;
  85. compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0); // 1, NULL 分别为 argc 和 argv
  86. CUmodule module = loadPTX(ptx, 1, NULL); // 1, NULL 分别为 argc 和 argv,有关于 GPU的输出
  87.  
  88. unsigned int mem_size = sizeof(T) * len;
  89. dim3 grid(1, 1, 1);
  90. dim3 threads(len, 1, 1);
  91. ArrayComparator<T> comparator;
  92. ArrayFileWriter<T> writer;
  93. StartTimer();
  94.  
  95. // 申请内存
  96. T *h_idata, *h_odata;
  97. CUdeviceptr d_idata, d_odata; // 与静态不同
  98. h_idata = (T *)malloc(mem_size);
  99. h_odata = (T *)malloc(mem_size);
  100. cuMemAlloc(&d_idata, mem_size); // 与静态不同
  101. cuMemAlloc(&d_odata, mem_size);
  102. for (unsigned int i = 0; i < len; ++i)
  103. h_idata[i] = (T)i;
  104. cuMemcpyHtoD(d_idata, h_idata, mem_size); // 与静态不同
  105.  
  106. // 计算和计时
  107. CUfunction kernel_addr = getKernel<T>(module);
  108.  
  109. void *arr[] = { (void *)&d_idata, (void *)&d_odata };
  110. cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, 0, &arr[0], 0);
  111. cuCtxSynchronize(); // 上下文同步
  112. cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len);// 与静态不同
  113. printf("\n\tProcessing time: %f ms\n", GetTimer());
  114.  
  115. // 检查结果
  116. computeGold<T>(h_idata, h_idata, len);// 生成理论结果数据
  117. bool result = comparator.compare(h_idata, h_odata, len);
  118. //writer.write("./data/regression.dat", h_odata, len, 0.0f);// 写入文件的部分
  119.  
  120. free(h_idata);
  121. free(h_odata);
  122. cuMemFree(d_idata); // 与静态不同
  123. cuMemFree(d_odata);
  124. return result;
  125. }
  126.  
  127. int main()
  128. {
  129. printf("\n\tStart.\n");
  130. printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
  131. printf("\n\t> test<int, 64>, result: %s.\n", test<int>(64) ? "Passed" : "Failed");
  132.  
  133. getchar();
  134. return 0;
  135. }

▶ 输出结果:

  1. Start.
  2. > Using CUDA Device [0]: GeForce GTX
  3. > GPU Device has SM 6.1 compute capability
  4.  
  5. Processing time: 0.699976 ms
  6.  
  7. > test<float, 32>, result: Passed.
  8. > Using CUDA Device [0]: GeForce GTX
  9. > GPU Device has SM 6.1 compute capability
  10.  
  11. Processing time: 0.665355 ms
  12.  
  13. > test<int, 64>, result: Passed.

▶ 涨姿势

● 封装了 SharedMemory,ArrayComparator,ArrayFileWriter 三个模板,并定义了其在不同的数据类型下的实现。

0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc的更多相关文章

随机推荐

  1. 第二百七十六节,MySQL数据库,【显示、创建、选定、删除数据库】,【用户管理、对用户增删改查以及授权】

    MySQL数据库,[显示.创建.选定.删除数据库],[用户管理.对用户增删改查以及授权] 1.显示数据库 SHOW DATABASES;显示数据库 SHOW DATABASES; mysql - 用户 ...

  2. 有用的sql语句积累

    ⑴.  sql查询未被外键关联的数据 select * from bb b where not exists (select 1 from aa a where a.bid=b.bid)

  3. 自动AC机

    可以在lemon和cena环境下使用. #include<iostream> #include<cstdio> #include<cstring> #include ...

  4. Java JDK 版本的区别

    jdk6和jdk5相比的新特性有: 1.instrumentation 在 Java SE 6 里面,instrumentation 包被赋予了更强大的功能:启动后的 instrument.本地代码 ...

  5. 重磅来袭,开源Asp.Net MVC网上商城BrnShop正式发布,提供源码下载(转)

    BrnShop网上商城是以Asp.Net mvc3为基础开发的网上商城,源代码完全开源(企业版的源代码目前还没有完全整理完成,一旦整理完成也全部开源). 啥话也不说了,直接上源码:下载源码(由于公司服 ...

  6. day43 数据库知识欠缺的

    一 什么是存储引擎 mysql中建立的库===>文件夹 库中建立的表===>文件 现实生活中我们用来存储数据的文件有不同的类型,每种文件类型对应各自不同的处理机制:比如处理文本用txt类型 ...

  7. ActiveMQ集群方案

    集群方案主要为了解决系统架构中的两个关键问题:高可用和高性能.ActiveMQ服务的高可用性是指,在ActiveMQ服务性能不变.数据不丢失的前提下,确保当系统灾难出现时ActiveMQ能够持续提供消 ...

  8. [C#]画图全攻略(饼图与柱状图)(转)

    http://blog.chinaunix.net/uid-15481846-id-2769484.html 首先建立一个c#的类库.   打开vs.net,建立一个名为Insight_cs.WebC ...

  9. bzoj 4025 二分图——线段树分治+LCT

    题目:https://www.lydsy.com/JudgeOnline/problem.php?id=4025 线段树分治,用 LCT 维护链的长度即可.不过很慢. 正常(更快)的方法应该是线段树分 ...

  10. <<精通正则表达式>> 书评

     IT产业新技术日新月异,令人目不暇给,然而在这其中,真正称得上伟大东西的却寥寥无几.1998年,被誉为“软件世界的爱迪生”,发明了BSD. TCP/IP.csh.vi和NFS的SUN首席科学家Bil ...