Using both OpenMP and pthreads, this example relies on Unified Memory addressing to compute a basic matrix-vector product, result = α * A * x + β * result, dispatching each task to either the host or the device depending on its size.

▶ Source code

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthread version instead of OpenMP

#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif

// Windows needs replacements for the POSIX functions srand48 and drand48
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T>
struct Task // a struct can define constructors, a destructor, and methods, just like a class
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};
    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
    {
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();
    }
    ~Task()
    {
        cudaDeviceSynchronize(); // make sure no kernel is still using the buffers
        cudaFree(data);
        cudaFree(result);
        cudaFree(vector);
    }
    void allocate(const unsigned int s, const unsigned int unique_id) // allocate managed memory and initialize the member arrays
    {
        id = unique_id;
        size = s;
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();

        for (unsigned int i = 0; i < size*size; i++)
            data[i] = drand48();
        for (unsigned int i = 0; i < size; i++)
        {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS
// wraps one pthread's share of the task list
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};
typedef struct threadData_t threadData;
#endif

template <typename T>
void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result) // computes result = α * A * x + β * result
{
    for (int i = 0; i < m; i++) // the original CUDA sample wrote n here and dropped the alpha factor below; both are fixed
    {
        result[i] *= *beta;
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
    threadData *dataPtr = (threadData *) inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100) // small tasks run on the host, larger ones on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            // stream 0 is reserved for host work: attach the buffers as host-accessible
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            // each worker thread owns stream/handle tid+1
            cublasSetStream(handle[tid+1], stream[tid+1]);
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
template <typename T>
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100) // small tasks run on the host, larger ones on the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif

template <typename T>
void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size;
        size = std::max((int)(drand48()*1000.0), 64); // random size in [64, 1000)
        TaskList[i].allocate(size, i);
    }
}

int main()
{
    printf("\n\tStart.\n");

    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited) // threads are not allowed to use this device
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));
    const int nthreads = 4;
    // stream/handle 0 serve the host path; streams/handles 1..nthreads serve the worker threads
    cudaStream_t *streams = new cudaStream_t[nthreads+1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;
    std::vector< Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    int temp = TaskList.size() / nthreads;
    for (int i = 0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if (temp == 0) // fewer tasks than threads
        {
            InputToThreads[i].taskSize = 0;
            InputToThreads[i].TaskListPtr = &TaskList[0];
        }
        else // at least as many tasks as threads: split evenly, the last thread also takes the remainder
        {
            if (i == nthreads - 1)
            {
                InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                // note: the original sample also added the remainder to this index,
                // which reads past the end of TaskList whenever N % nthreads != 0; fixed here
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
            else
            {
                InputToThreads[i].taskSize = temp;
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
    delete[] InputToThreads;
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // cleanup
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    delete[] streams;
    delete[] handles;
    std::vector< Task<double> >().swap(TaskList);

    printf("\n\tFinish.\n");
    getchar();
    return 0;
}
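For reference, a plausible build line on Linux would be `nvcc -o UnifiedMemoryStreams UnifiedMemoryStreams.cu -lcublas -Xcompiler -fopenmp`, adding `-DUSE_PTHREADS` (and linking a pthreads library) for the pthread variant; the file name and flags are assumptions, since the blog build was evidently done under Visual Studio, as the pthreadVC2.lib pragma suggests.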

▶ Output: OpenMP

    Start.

Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Finish.

▶ Output: pthreads

    Start.

Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on host
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Finish.
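Each line above is decided by the size threshold in execute(): tasks with size below 100 fall back to the host gemv, which is why a few lines of the pthreads run read "on host", while the OpenMP run happened to draw only tasks large enough for the device path.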

▶ Takeaways:

● The C++ struct is used like a class: the constructor, destructor, and other methods are defined directly inside the struct.
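A minimal sketch of that pattern in isolation (the ManagedBuffer name is illustrative, not from the sample): the constructor owns a Unified Memory allocation and the destructor releases it, so cleanup happens automatically when the object goes out of scope.

    #include <cuda_runtime.h>

    template <typename T>
    struct ManagedBuffer
    {
        T *ptr;
        size_t n;

        ManagedBuffer(size_t count) : ptr(NULL), n(count)
        {
            cudaMallocManaged(&ptr, sizeof(T) * n); // visible to both host and device
            cudaDeviceSynchronize();
        }
        ~ManagedBuffer()
        {
            cudaDeviceSynchronize(); // make sure no kernel is still using ptr
            cudaFree(ptr);
        }
    };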

● The cuBLAS library is used; note how the handles are created, bound to streams, and passed to the library calls.
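A self-contained sketch of that calling pattern, with arbitrary sizes and values: create a handle, bind it to a stream with cublasSetStream, issue cublasDgemv, and synchronize before the host reads the managed result.

    #include <cstdio>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    int main()
    {
        const int n = 4;
        double *A, *x, *y;
        cudaMallocManaged(&A, sizeof(double) * n * n);
        cudaMallocManaged(&x, sizeof(double) * n);
        cudaMallocManaged(&y, sizeof(double) * n);
        for (int i = 0; i < n * n; i++) A[i] = 1.0;
        for (int i = 0; i < n; i++) { x[i] = 1.0; y[i] = 0.0; }

        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cublasHandle_t handle;
        cublasCreate(&handle);
        cublasSetStream(handle, stream); // subsequent cuBLAS calls run in this stream

        double alpha = 1.0, beta = 0.0;
        // n x n matrix with leading dimension n: y = alpha*A*x + beta*y
        cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1);
        cudaStreamSynchronize(stream); // wait before the host reads y

        printf("y[0] = %f\n", y[0]); // expect 4.0: each row sums four ones

        cublasDestroy(handle);
        cudaStreamDestroy(stream);
        cudaFree(A); cudaFree(x); cudaFree(y);
        return 0;
    }

One caveat worth noting: cuBLAS assumes column-major storage, while the host-side gemv above indexes row-major, so for a non-symmetric A the device path effectively computes with the transpose; the sample never compares the two results, so this does not surface there.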

● The memory-attach API used by the sample:

 // driver_types.h
#define cudaMemAttachGlobal 0x01 // memory is accessible from any stream on any device
#define cudaMemAttachHost   0x02 // memory is accessible from the host only, not from device streams
#define cudaMemAttachSingle 0x04 // memory is accessible only from the single stream it is attached to

// cuda_runtime.h
template<class T>
static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
    return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}

// cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
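A hedged usage sketch of the attach flow the execute() functions rely on (the helper name is illustrative):

    #include <cuda_runtime.h>

    // hand a managed buffer to the host, then back to a single stream
    void host_then_stream(cudaStream_t s, double *buf /* from cudaMallocManaged */)
    {
        // length 0 means "the whole allocation"
        cudaStreamAttachMemAsync(s, buf, 0, cudaMemAttachHost);
        cudaStreamSynchronize(s); // the attachment takes effect once s drains
        // ... safe host access to buf here, even while other streams run kernels ...
        cudaStreamAttachMemAsync(s, buf, 0, cudaMemAttachSingle); // back to stream-exclusive
    }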
