Using two host-threading environments, OpenMP and pthreads, together with Unified Memory, compute the basic matrix–vector product result = α * A * x + β * result.

▶ Source code

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthreads version
#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif

// Windows lacks srand48 / drand48, so provide equivalent helpers
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T> struct Task // like a class, a struct can have constructors and a destructor
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};
    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
    {
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();
    }

    ~Task()
    {
        cudaDeviceSynchronize();
        cudaFree(data);
        cudaFree(result);
        cudaFree(vector);
    }

    void allocate(const unsigned int s, const unsigned int unique_id) // allocate unified memory and initialize the member arrays
    {
        id = unique_id;
        size = s;
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();

        for (unsigned int i = 0; i < size*size; i++)
            data[i] = drand48();
        for (unsigned int i = 0; i < size; i++)
        {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS // bundle the work handed to each pthread
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};

typedef struct threadData_t threadData;
#endif

// host gemv: result = α * A * x + β * result
template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
{
    for (int i = 0; i < m; i++) // the original sample looped to n here and dropped alpha below
    {
        result[i] *= *beta;
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
    threadData *dataPtr = (threadData *) inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100) // small tasks run on the host, larger ones on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]); // make sure the attach operations have finished
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            cublasSetStream(handle[tid+1], stream[tid+1]);
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100) // small tasks run on the host, larger ones on the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]); // make sure the attach operations have finished
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif

template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size;
        size = std::max((int)(drand48()*1000.0), 64); // random size in [64, 1000)
        TaskList[i].allocate(size, i);
    }
}

int main()
{
    printf("\n\tStart.\n");

    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited) // device must not be in prohibited compute mode
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));
    const int nthreads = 4;
    // stream 0 / handle 0 serve the host path; streams 1..nthreads serve the worker threads
    cudaStream_t *streams = new cudaStream_t[nthreads+1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;
    std::vector< Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    int temp = TaskList.size() / nthreads;
    for (int i = 0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if (temp == 0) // fewer tasks than threads
        {
            InputToThreads[i].taskSize = 0;
            InputToThreads[i].TaskListPtr = &TaskList[0];
        }
        else // at least as many tasks as threads: split evenly, the remainder all goes to the last thread
        {
            if (i == nthreads - 1)
            {
                InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
            else
            {
                InputToThreads[i].taskSize = temp;
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // cleanup
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    std::vector< Task<double> >().swap(TaskList);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

▶ Output: OpenMP

  1. Start.
  2.  
  3. Task [ ], thread [ ], size[ ], on device
  4. Task [ ], thread [ ], size[ ], on device
  5. Task [ ], thread [ ], size[ ], on device
  6. Task [ ], thread [ ], size[ ], on device
  7. Task [ ], thread [ ], size[ ], on device
  8. Task [ ], thread [ ], size[ ], on device
  9. Task [ ], thread [ ], size[ ], on device
  10. Task [ ], thread [ ], size[ ], on device
  11. Task [ ], thread [ ], size[ ], on device
  12. Task [ ], thread [ ], size[ ], on device
  13. Task [], thread [ ], size[ ], on device
  14. Task [], thread [ ], size[ ], on device
  15. Task [], thread [ ], size[ ], on device
  16. Task [], thread [ ], size[ ], on device
  17. Task [], thread [ ], size[ ], on device
  18. Task [], thread [ ], size[ ], on device
  19. Task [], thread [ ], size[ ], on device
  20. Task [], thread [ ], size[ ], on device
  21. Task [], thread [ ], size[ ], on device
  22. Task [], thread [ ], size[ ], on device
  23. Task [], thread [ ], size[ ], on device
  24. Task [], thread [ ], size[ ], on device
  25. Task [], thread [ ], size[ ], on device
  26. Task [], thread [ ], size[ ], on device
  27. Task [], thread [ ], size[ ], on device
  28. Task [], thread [ ], size[ ], on device
  29. Task [], thread [ ], size[ ], on device
  30. Task [], thread [ ], size[ ], on device
  31. Task [], thread [ ], size[ ], on device
  32. Task [], thread [ ], size[ ], on device
  33. Task [], thread [ ], size[ ], on device
  34. Task [], thread [ ], size[ ], on device
  35. Task [], thread [ ], size[ ], on device
  36. Task [], thread [ ], size[ ], on device
  37. Task [], thread [ ], size[ ], on device
  38. Task [], thread [ ], size[ ], on device
  39. Task [], thread [ ], size[ ], on device
  40. Task [], thread [ ], size[ ], on device
  41. Task [], thread [ ], size[ ], on device
  42. Task [], thread [ ], size[ ], on device
  43. Finish.

▶ Output: pthreads

  1. Start.
  2.  
  3. Task [ ], thread [ ], size [ ], on device
  4. Task [], thread [ ], size [ ], on device
  5. Task [], thread [ ], size [ ], on device
  6. Task [], thread [ ], size [ ], on device
  7. Task [], thread [ ], size [ ], on device
  8. Task [], thread [ ], size [ ], on device
  9. Task [ ], thread [ ], size [ ], on host
  10. Task [], thread [ ], size [ ], on device
  11. Task [], thread [ ], size [ ], on device
  12. Task [], thread [ ], size [ ], on device
  13. Task [], thread [ ], size [ ], on device
  14. Task [], thread [ ], size [ ], on device
  15. Task [ ], thread [ ], size [ ], on device
  16. Task [], thread [ ], size [ ], on device
  17. Task [], thread [ ], size [ ], on device
  18. Task [ ], thread [ ], size [ ], on device
  19. Task [], thread [ ], size [ ], on device
  20. Task [], thread [ ], size [ ], on device
  21. Task [], thread [ ], size [ ], on device
  22. Task [ ], thread [ ], size [ ], on device
  23. Task [], thread [ ], size [ ], on device
  24. Task [ ], thread [ ], size [ ], on device
  25. Task [], thread [ ], size [ ], on device
  26. Task [], thread [ ], size [ ], on device
  27. Task [], thread [ ], size [ ], on device
  28. Task [], thread [ ], size [ ], on device
  29. Task [], thread [ ], size [ ], on host
  30. Task [ ], thread [ ], size [ ], on host
  31. Task [], thread [ ], size [ ], on device
  32. Task [ ], thread [ ], size [ ], on device
  33. Task [], thread [ ], size [ ], on device
  34. Task [ ], thread [ ], size [ ], on device
  35. Task [], thread [ ], size [ ], on device
  36. Task [], thread [ ], size [ ], on device
  37. Task [], thread [ ], size [ ], on device
  38. Task [ ], thread [ ], size [ ], on host
  39. Task [], thread [ ], size [ ], on device
  40. Task [], thread [ ], size [ ], on device
  41. Task [], thread [ ], size [ ], on device
  42. Task [], thread [ ], size [ ], on device
  43. Finish.

▶ Takeaways:

● A C++ struct is used the way a class would be: the struct defines a constructor, a destructor, and other member functions. A minimal sketch of the same pattern follows below.
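A minimal sketch of that pattern (the ManagedBuffer name and the buffer size are made up for illustration, not taken from the sample): the constructor acquires unified memory and the destructor releases it, so the struct behaves like a small RAII class.

#include <cuda_runtime.h>

// Hypothetical RAII wrapper: a struct whose constructor/destructor own a managed buffer
struct ManagedBuffer
{
    double *ptr;
    size_t  n;

    ManagedBuffer(size_t count) : ptr(NULL), n(count)
    {
        cudaMallocManaged(&ptr, n * sizeof(double)); // allocate unified memory
    }
    ~ManagedBuffer()
    {
        cudaDeviceSynchronize(); // make sure no kernel is still using the buffer
        cudaFree(ptr);
    }
};

int main()
{
    ManagedBuffer buf(1024);            // constructor allocates
    for (size_t i = 0; i < buf.n; i++)  // the host can touch unified memory directly
        buf.ptr[i] = 0.0;
    return 0;                           // destructor frees automatically
}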

● The cuBLAS library is used; note how the handle is created, bound to a stream, and passed to every library call.
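A minimal, self-contained sketch of that handle lifecycle (matrix size and values are arbitrary, error checking omitted): create the handle, bind it to a stream, issue cublasDgemv on managed memory, then destroy everything.

#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    const int n = 256;
    double *A, *x, *y;
    cudaMallocManaged(&A, sizeof(double) * n * n);
    cudaMallocManaged(&x, sizeof(double) * n);
    cudaMallocManaged(&y, sizeof(double) * n);
    for (int i = 0; i < n * n; i++) A[i] = 1.0;
    for (int i = 0; i < n; i++) { x[i] = 1.0; y[i] = 0.0; }

    cublasHandle_t handle;
    cublasCreate(&handle);              // every cuBLAS call needs a handle

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasSetStream(handle, stream);    // bind the handle to a stream (optional)

    double alpha = 1.0, beta = 0.0;     // y = alpha * A * x + beta * y
    cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1);
    cudaStreamSynchronize(stream);
    printf("y[0] = %f\n", y[0]);        // all-ones data, so expect 256.0

    cublasDestroy(handle);
    cudaStreamDestroy(stream);
    cudaFree(A); cudaFree(x); cudaFree(y);
    return 0;
}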

● The memory-attach declarations used above:

// driver_types.h
#define cudaMemAttachGlobal 0x01 // memory can be accessed by any stream on any device
#define cudaMemAttachHost   0x02 // memory cannot be accessed by any device stream; it stays host-accessible
#define cudaMemAttachSingle 0x04 // memory can only be accessed by a single stream on the associated device

// cuda_runtime.h — length = 0 means the entire allocation is attached
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
    return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}

// cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
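Putting the two flags together, a small illustrative example (buffer size and the single-stream usage are assumptions, not taken from the sample): attach with cudaMemAttachHost before touching the buffer on the CPU, then re-attach with cudaMemAttachSingle before queuing device work on one stream.

#include <cuda_runtime.h>

int main()
{
    const int n = 1024;
    float *buf;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMallocManaged(&buf, n * sizeof(float));

    // cudaMemAttachHost: the allocation stays visible to the CPU even while
    // kernels run in other streams (important on pre-Pascal GPUs)
    cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream);               // wait until the attach has taken effect
    for (int i = 0; i < n; i++) buf[i] = 1.0f;   // safe host access

    // cudaMemAttachSingle: from now on only work queued in `stream`
    // may touch the allocation on the device
    cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachSingle);
    // ... launch kernels or cuBLAS calls on `stream` here ...

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(buf);
    return 0;
}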
