0_Simple__UnifiedMemoryStreams
使用 OpenMP 和 pthreads 两种环境,利用统一内存编址(Unified Memory),计算基本的矩阵向量乘法 result = α * A * x + β * result 。
▶ 源代码
#include <cstdio>
#include <cstdlib>   // rand/srand, used by the Windows srand48/drand48 shims
#include <ctime>     // time(NULL), used to seed the RNG in main
#include <algorithm>
#include <vector>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to use the pthread path instead of OpenMP
#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif
// Windows has no POSIX srand48/drand48; emulate them with srand()/rand().
// drand48 returns a uniform double in [0.0, 1.0], matching the POSIX contract
// closely enough for test-data generation.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif
// A single gemv task: a size*size matrix `data`, input `vector`, and output
// `result`, all allocated with cudaMallocManaged (unified memory) so either
// the host or the device can touch them.
// NOTE(review): the struct owns raw pointers but has no copy control; it is
// only safe inside std::vector because the vector is sized once and never
// reallocated — a copy would double-free in ~Task(). Confirm callers keep it so.
template <typename T> struct Task
{
    unsigned int size, id;
    T *data;    // size*size matrix elements
    T *result;  // output vector, length size
    T *vector;  // input vector, length size

    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {}

    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
    {
        cudaMallocManaged(&data, sizeof(T) * size * size);
        cudaMallocManaged(&result, sizeof(T) * size);
        cudaMallocManaged(&vector, sizeof(T) * size);
        cudaDeviceSynchronize();  // make the managed buffers safe to touch on host
    }

    ~Task()
    {
        // Wait for any in-flight work on these buffers before freeing them.
        cudaDeviceSynchronize();
        cudaFree(data);
        cudaFree(result);
        cudaFree(vector);
    }

    // Allocate the managed buffers and fill matrix/vector with random values;
    // result starts at zero.
    void allocate(const unsigned int s, const unsigned int unique_id)
    {
        id = unique_id;
        size = s;
        cudaMallocManaged(&data, sizeof(T) * size * size);
        cudaMallocManaged(&result, sizeof(T) * size);
        cudaMallocManaged(&vector, sizeof(T) * size);
        cudaDeviceSynchronize();
        for (unsigned int i = 0; i < size * size; i++)
            data[i] = drand48();
        for (unsigned int i = 0; i < size; i++)
        {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};
- #ifdef USE_PTHREADS// 封装 pthread 型的任务
// Per-worker description for the pthread path: each worker owns a contiguous
// slice of the task list and shares the stream/handle arrays created in main.
struct threadData_t
{
    int tid;                    // worker index; selects streams[tid+1] / handles[tid+1]
    Task<double> *TaskListPtr;  // first task of this worker's slice
    cudaStream_t *streams;      // shared array of nthreads+1 streams (stream[0] = host work)
    cublasHandle_t *handles;    // shared array of nthreads+1 cuBLAS handles
    int taskSize;               // number of tasks in the slice
};
typedef struct threadData_t threadData;
- #endif
// Host-side gemv: result = alpha * A * x + beta * result,
// where A is an m x n matrix stored row-major (A[i*n + j]).
// (The blog notes the book's original mistakenly looped i over n and
// dropped alpha; both are restored here.)
template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
{
    for (int i = 0; i < m; i++)
    {
        result[i] *= *beta;                      // scale the previous contents
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i * n + j] * x[j];
    }
}
- // execute a single task on either host or device depending on size
- #ifdef USE_PTHREADS
// pthread worker entry: processes this thread's slice of the task list.
// Small tasks run on the host CPU; larger ones go to the GPU via cuBLAS on
// this thread's dedicated stream (streams[tid+1]); streams[0] is reserved
// for host-side attachment.
void *execute(void *inpArgs)
{
    threadData *dataPtr = (threadData *)inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100)  // small task: cheaper to run on the host
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            // Attach the managed buffers to the host so CPU access does not
            // fault while other streams keep the GPU busy (length 0 = whole allocation).
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);  // wait until the attachment takes effect
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else  // large task: run on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            cublasSetStream(handle[tid + 1], stream[tid + 1]);
            // Restrict the buffers to this thread's stream only.
            cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle);
            // cuBLAS is column-major; lda = t.size, incx = incy = 1.
            cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &alpha,
                        t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
- #else
// OpenMP worker body: runs one task on either host or device depending on its
// size. streams[0] is reserved for host-side attachment; streams[tid+1] /
// handles[tid+1] belong to OpenMP thread tid.
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100)  // small task: cheaper to run on the host
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        // Attach the managed buffers to the host (length 0 = whole allocation)
        // so CPU access does not fault while the GPU is busy elsewhere.
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);  // wait until the attachment takes effect
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else  // large task: run on the device
    {
        printf("\nTask [%2d], thread [%2d], size[%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid + 1], stream[tid + 1]);
        // Restrict the buffers to this thread's stream only.
        cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle);
        // cuBLAS is column-major; lda = t.size, incx = incy = 1.
        cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &alpha,
                    t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
- #endif
// Give every task a random size in [64, 1000) and allocate/fill its buffers.
// The 64 floor (value from the original NVIDIA sample) keeps tasks non-trivial.
template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size = std::max((int)(drand48() * 1000.0), 64);
        TaskList[i].allocate(size, i);
    }
}
int main()
{
    printf("\n\tStart.\n");

    // Unified memory requires a device reporting managedMemory support.
    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited)  // device refuses new contexts
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));

    // One stream/handle per worker plus index 0 reserved for host-side work.
    const int nthreads = 4;
    cudaStream_t *streams = new cudaStream_t[nthreads + 1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
    for (int i = 0; i < nthreads + 1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;  // total number of tasks
    std::vector<Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    int temp = TaskList.size() / nthreads;
    for (int i = 0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;
        if (temp == 0)  // fewer tasks than threads: nothing to hand out per-thread
        {
            InputToThreads[i].taskSize = 0;
            InputToThreads[i].TaskListPtr = &TaskList[0];
        }
        else  // split evenly; the last thread also takes the remainder
        {
            if (i == nthreads - 1)
            {
                InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                // BUGFIX: the last slice must start at i*temp. Starting at
                // i*temp + remainder (as the blog had it) skips tasks and
                // reads past the end of TaskList.
                InputToThreads[i].TaskListPtr = &TaskList[i * temp];
            }
            else
            {
                InputToThreads[i].taskSize = temp;
                InputToThreads[i].TaskListPtr = &TaskList[i * temp];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
    delete[] InputToThreads;  // was leaked in the original
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // Cleanup: destroy per-thread streams/handles, then release the arrays
    // (both were leaked in the original) and drop all tasks.
    for (int i = 0; i < nthreads + 1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    delete[] streams;
    delete[] handles;
    std::vector< Task<double> >().swap(TaskList);

    printf("\n\tFinish.\n");
    getchar();
    return 0;
}
▶ 输出结果:OpenMP
- Start.
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [ ], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Task [], thread [ ], size[ ], on device
- Finish.
▶ 输出结果:pthreads
- Start.
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on host
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on host
- Task [ ], thread [ ], size [ ], on host
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [ ], thread [ ], size [ ], on host
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Task [], thread [ ], size [ ], on device
- Finish.
▶ 涨姿势:
● 使用 C++ 结构体完成了类似类的方法。即在结构体中定义构造函数、析构函数及其他方法。
● 使用了 cuBLAS 库,注意句柄的使用和库函数的调用。
● 用到的申请内存的函数
- // driver_types.h
- #define cudaMemAttachGlobal 0x01 // 任意流均可访问该内存
- #define cudaMemAttachHost   0x02 // 仅主机可访问该内存(设备流不可访问)
- #define cudaMemAttachSingle 0x04 // 仅指定的单个流可访问该内存
- // cuda_runtime.h
- template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
- {
-     return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
- }
- // cuda_runtime_api.h
- extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
0_Simple__UnifiedMemoryStreams的更多相关文章
随机推荐
- MySQL主从复制报错及解决方法
mysql> show slave status \G *************************** 1. row *************************** Slave_ ...
- ElasticSearch(八):springboot集成ElasticSearch集群并使用
1. 集群的搭建 见:ElasticSearch(七) 2. springboot配置集群 2.1 创建springboot项目,使用idea创建,不过多介绍(创建项目时候建议不要勾选elastics ...
- 前端jquery学习--03
1.tab切换 <!DOCTYPE html> <html lang="en"> <head> <meta charset="U ...
- 1、ECMAScript 6 简介
ECMAScript 和 JavaScript 的关系 ES6 与 ECMAScript 2015 的关系 语法提案的批准流程 ECMAScript 的历史 部署进度 Babel 转码器 Traceu ...
- SysRq魔法键的使用
SysRq魔法键的使用 1.SysRq简介它能够在系统处于极端环境时响应按键并完成相应的处理.这在大多数时候有用.SysRq 经常被称为 Magic System Request,它被定义为一系列按键 ...
- vulcanjs 简单package 编写
vulcanjs 功能是以包进行管理,包里面包含了运行依赖的组件以及对于路由的注册 参考项目 项目结构 ├── README.md ├── license.md ├── package-lock.js ...
- 转 JavaScript中判断对象类型的种种方法
我们知道,JavaScript中检测对象类型的运算符有:typeof.instanceof,还有对象的constructor属性: 1) typeof 运算符 typeof 是一元运算符,返回结果是一 ...
- Apache2.4配置(全)
http://blog.csdn.net/u012291157/article/details/46492137
- Centos7修改文件夹权限和用户名用户组
Linux系统下经常遇到文件或者文件夹的权限问题,或者是因为文件夹所属的用户问题而没有访问的权限.根据我自己遇到的情况,对这类问题做一个小结.在命令行使用命令“ll”或者“ls -a”,可以查看文件或 ...
- 02.将uboot,kernel,rootfs下载到开发板上
转载,侵删 将uboot,kernel,rootfs下载到开发板上 1.为什么要下载 所谓下载,也称烧录,部署. 1.1.什么是u-boot Hi3518EV200 单板的 Bootloader 采用 ...