0_Simple__matrixMulCUBLAS
使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则,关于乘法函数中详细的参数说明和调用规则见另一篇随笔。
▶ 源代码:
#include <assert.h>
#include <helper_string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_functions.h>
#include <helper_cuda.h>

// Fallback min/max macros, defined only when no macro of that name exists yet.
// Every argument and the whole expansion are parenthesized so that operators in
// the caller's expressions (e.g. `a | b`) cannot re-associate with the comparison.
// Each argument may be evaluated twice: do not pass expressions with side effects.
#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif

// Structure holding the dimensions of each matrix
// Dimensions of the three matrices in C = A * B.
// "W" is a width (number of columns) and "H" a height (number of rows).
typedef struct _matrixSize
{
    unsigned int uiWA;  // width of A
    unsigned int uiHA;  // height of A
    unsigned int uiWB;  // width of B
    unsigned int uiHB;  // height of B
    unsigned int uiWC;  // width of C
    unsigned int uiHC;  // height of C
} sMatrixSize;

// CPU matrix multiplication. The three size parameters give the row count, the
// inner (dot-product) length and the column count; no error checking is done.
// Reference matrix multiplication on the host: C = A * B.
//
// All three matrices are indexed as dense row-major arrays:
//   A is hA x wA, B is wA x wB, C is hA x wB.
//
// Parameters:
//   C  - output buffer with room for hA * wB floats (fully overwritten)
//   A  - left operand, hA * wA floats
//   B  - right operand, wA * wB floats
//   hA - number of rows of A (and of C)
//   wA - number of columns of A, i.e. the length of each dot product
//   wB - number of columns of B (and of C)
//
// No argument validation is performed; the caller must pass consistent sizes.
void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)           // row i of A and C
    {
        for (unsigned int j = 0; j < wB; ++j)       // column j of B and C
        {
            // Accumulate in double and round to float only once at the end.
            double sum = 0.0;
            for (unsigned int k = 0; k < wA; ++k)   // walk along the dot product
            {
                // Index in size_t so that large matrices cannot wrap a 32-bit product.
                double a = A[(size_t)i * wA + k];
                double b = B[(size_t)k * wB + j];
                sum += a * b;
            }
            C[(size_t)i * wB + j] = (float)sum;
        }
    }
}

// Array initialization
// Fill data[0 .. size-1] with pseudo-random floats in [0, 1].
//
// Values come from rand(), so the sequence is determined by the most recent
// srand() call. A size of zero or less leaves the buffer untouched.
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

// Print the differing values of two matrices and their positions; the allowed
// tolerance is fListTol and at most iListLength entries are listed
// Print the elements at which two row-major matrices differ by more than a tolerance.
//
// Parameters:
//   data1, data2 - the matrices to compare, each width * height floats
//                  (printed under the labels "CPU" and "GPU" respectively)
//   width        - number of columns
//   height       - number of rows
//   iListLength  - maximum number of differing elements to print in detail
//   fListTol     - absolute difference above which two elements count as different
//
// Every differing element is counted even after the print limit is reached, and
// the total is printed at the end. Locations are printed as (column, row).
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
    printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
    int i, j, k;
    int error_count = 0;

    for (j = 0; j < height; j++)
    {
        // Row headers are only printed while the detailed listing is still active.
        if (error_count < iListLength)
            printf("\n Row %d:\n", j);

        for (i = 0; i < width; i++)
        {
            k = j * width + i;
            float fDiff = fabs(data1[k] - data2[k]);
            if (fDiff > fListTol)
            {
                if (error_count < iListLength)
                    printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
                error_count++;
            }
        }
    }
    printf(" \n Total Errors = %d\n", error_count);
}

// Device initialization: select the device and set the matrix dimensions
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
cudaError_t error; // 选择设备,略去了检查错误部分
devID = ;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
}
error = cudaGetDevice(&devID);
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
iSizeMultiple = max(min(iSizeMultiple, ), );
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // Fermi 以上构架的计算机使用更大的线程块(blockDim),这里用了32
int block_size = (deviceProp.major < ) ? : ; matrix_size.uiWA = * block_size * iSizeMultiple;
matrix_size.uiHA = * block_size * iSizeMultiple;
matrix_size.uiWB = * block_size * iSizeMultiple;
matrix_size.uiHB = * block_size * iSizeMultiple;
matrix_size.uiWC = * block_size * iSizeMultiple;
matrix_size.uiHC = * block_size * iSizeMultiple; printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC);
if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-);
}
} // 计算矩阵乘法部分
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, devID);
int block_size = (deviceProp.major < ) ? : ; unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B); srand();
randomInit(h_A, size_A);
randomInit(h_B, size_B); float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
//float *h_C = (float *) malloc(mem_size_C); // TM 没有用!
float *h_CUBLAS = (float *) malloc(mem_size_C); // 保存 d_C 回传的结果
cudaMalloc((void **) &d_A, mem_size_A);
cudaMalloc((void **) &d_B, mem_size_B);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_C, mem_size_C); dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); printf("Computing result using CUBLAS...");
int nIter = ; //cuBLAS代码块
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop; cublasCreate(&handle); //热身,注意转置的问题,不采用 <<< >>> 调用核函数
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord(start, NULL); for (int j = ; j < nIter; j++)
{
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
}
printf("done.\n");
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop); // 计算了耗时、操作数以及操作速度
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
cublasDestroy(handle);
} printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n"); bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true)
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, , 1.0e-5f); printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); free(h_A);
free(h_B);
free(h_CUBLAS);
free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C); if (resCUBLAS == true)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n"); int devID = , sizeMult = ;
sMatrixSize matrix_size; initializeCUDA(argc, argv, devID, sizeMult, matrix_size); int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); getchar();
return matrix_result;
}
▶ 输出结果:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

MatrixA(640,480), MatrixB(480,320), MatrixC(640,320)
Computing result using CUBLAS...done.
Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= 196608000 Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
▶ 涨姿势:
● 代码依然很烂。
多用了一个 h_C 根本没有用上,其作用被 h_CUBLAS 取代了,而且源代码中有free(h_C)却没有free(h_CUBLAS)。
float *h_C = (float *)malloc(mem_size_C);
...
free(h_C);
● 句柄的创建与销毁。句柄类型 cublasHandle_t 定义于 cublas_api.h 中,cublas_v2.h 通过宏把 cublasCreate / cublasDestroy 映射到对应的 _v2 版本函数
/*cublas_api.h*/
typedef struct cublasContext *cublasHandle_t; /*cublas_v2.h*/
#define cublasCreate cublasCreate_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle); #define cublasDestroy cublasDestroy_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);
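● 矩阵乘法计算核心函数。实际上该函数并不是专门用来计算矩阵乘法的,它计算的是通用形式 C = α·op(A)·op(B) + β·C(取 α = 1、β = 0 时即为普通矩阵乘法)。对应不同的数据类型(实数、复数)和数据精度(单精度、双精度),常用的一共有四个函数:cublasSgemm、cublasDgemm、cublasCgemm、cublasZgemm。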
#define cublasSgemm cublasSgemm_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2
(
cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc
);
● 定义在 helper_image.h 中的一个函数,用于比较两个长为 len 的数组是否足够接近:它不是逐元素比较,而是计算相对 L2 误差 ‖reference − data‖₂ / ‖reference‖₂,当该值小于容差 epsilon 时返回 true
inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)
● 调用cublas计算矩阵乘法的过程摘要(详细的参数说明和调用规则见另一篇随笔)
// ... Prepare d_A, d_B and d_C on the device; d_A and d_B hold the two matrices to multiply.
// d_C needs no initialization: with beta = 0.0 the previous contents of C do not
// contribute to the result (cudaMalloc itself does not zero the buffer).

// Thread-block and grid dimensions as set up in the full listing. cublasSgemm takes no
// launch configuration (see its parameter list), so the cuBLAS path does not consume them.
dim3 threads(block_size, block_size);
dim3 grid(WC / threads.x, HC / threads.y);

// Scaling factors: to compute d_C = d_A * d_B use alpha = 1.0 and beta = 0.0
const float alpha = 1.0f;
const float beta = 0.0f;

// Create the handle; it must be destroyed once the computation is finished
cublasHandle_t handle;
cublasCreate(&handle);

// Call the GEMM routine; mind the argument order (B is passed before A)
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB);

cublasDestroy(handle);

// ... Copy the result back from d_C; this step may be swapped with destroying the handle
0_Simple__matrixMulCUBLAS的更多相关文章
随机推荐
- uva 1121 Subsequence
https://vjudge.net/problem/UVA-1121 题意: 给出一个正整数数列a,要求找出最短的连续的一个序列使得这个序列的所有数字之和大于等于S. 思路: 第一是由于序列都是正整 ...
- Python cPickle模块
新博客地址:http://gorthon.sinaapp.com/ 持久性就是指保持对象,甚至在多次执行同一程序之间也保持对象.通过本文,您会对 Python对象的各种持久性机制(从关系数据库到 Py ...
- handlebar JS模板使用笔记
直接上代码: (定义模板) (编译注入) ***知识点*** //数据必须为Json数据(强调:jsonp数据不行,和json是两种数据,jsonp多了callback回调函数来包裹json数据) 遍 ...
- 详细分析apache httpd反向代理的用法
html { font-family: sans-serif } body { margin: 0 } article,aside,details,figcaption,figure,footer,h ...
- python之testcenter操作
一.设置python环境 1. 从以下路径中将StcPython.py文件拷贝出来 Linux: /Installdir/Spirent_TestCenter_4.xx/Spirent_TestCen ...
- ASP.NET没有魔法——ASP.NET MVC 与数据库之EF实体类与数据库结构
大家都知道在关系型数据库中每张表的每个字段都会有自己的属性,如:数据类型.长度.是否为空.主外键.索引以及表与表之间的关系.但对于C#编写的类来说,它的属性只有一个数据类型和类与类之间的关系,但是在M ...
- hadoop各个名词的理解
Hadoop家族的各个成员 hadoop这个词已经流行好多年了,一提到大数据就会想到hadoop,那么hadoop的作用是什么呢? 官方定义:hadoop是一个开发和运行处理大规模数据的软件平台.核心 ...
- Elasticsearch分片、副本与路由(shard replica routing)
本文讲述,如何理解Elasticsearch的分片.副本和路由策略. 1.预备知识 1)分片(shard) Elasticsearch集群允许系统存储的数据量超过单机容量,实现这一目标引入分片策略sh ...
- Docker入门系列(一):目标和安排
Docker入门系列(一) 这个系列的教程来源于docker的官方文档,此文档的目的在于一步一步学习docker的使用方法. 这一系列的教程有如下几篇文档: docker安装启动 构建第一个docke ...
- SQL server 数据库备份大
首先简单的介绍一下Sql server 备份的类型有: 1:完整备份(所有的数据文件和部分的事务日志文件) 2:差异备份(最后一次完成备份后数据库改变的部分) 3:文件和文件组备份(对指定的文件和文件 ...