使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则,关于乘法函数中详细的参数说明和调用规则见另一篇随笔。

▶ 源代码:

 #include <assert.h>
#include <helper_string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_functions.h>
#include <helper_cuda.h> #ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif // 存放各矩阵维数的结构体
typedef struct _matrixSize
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize; // CPU 计算矩阵乘法。三个参数分别用于行定位、行程定位、列定位,没有查错机制。
void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = ; i < hA; ++i) // 从上往下数 i 行
{
for (unsigned int j = ; j < wB; ++j) // 从左往右数 j 列
{
double sum = ;
for (unsigned int k = ; k < wA; ++k) // 行程长
{
double a = A[i * wA + k]; // 中间过程用 double,结果输出 float
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
} // 初始化数组
void randomInit(float *data, int size)
{
for (int i = ; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
} //输出两个矩阵的不相等的值及其位置,允许容差为 fListTol ,最多输出 iListLength 个
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
int i, j, k;
int error_count = ; for (j = ; j < height; j++)
{
if (error_count < iListLength)
printf("\n Row %d:\n", j); for (i = ; i < width; i++)
{
k = j * width + i;
float fDiff = fabs(data1[k] - data2[k]);
if (fDiff > fListTol)
{
if (error_count < iListLength)
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
error_count++;
}
}
}
printf(" \n Total Errors = %d\n", error_count);
} // 初始化设备。包括选择设备,设定维数
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
cudaError_t error; // 选择设备,略去了检查错误部分
devID = ;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
}
error = cudaGetDevice(&devID);
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
iSizeMultiple = max(min(iSizeMultiple, ), );
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // Fermi 以上构架的计算机使用更大的线程块(blockDim),这里用了32
int block_size = (deviceProp.major < ) ? : ; matrix_size.uiWA = * block_size * iSizeMultiple;
matrix_size.uiHA = * block_size * iSizeMultiple;
matrix_size.uiWB = * block_size * iSizeMultiple;
matrix_size.uiHB = * block_size * iSizeMultiple;
matrix_size.uiWC = * block_size * iSizeMultiple;
matrix_size.uiHC = * block_size * iSizeMultiple; printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC);
if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-);
}
} // 计算矩阵乘法部分
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, devID);
int block_size = (deviceProp.major < ) ? : ; unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B); srand();
randomInit(h_A, size_A);
randomInit(h_B, size_B); float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
//float *h_C = (float *) malloc(mem_size_C); // TM 没有用!
float *h_CUBLAS = (float *) malloc(mem_size_C); // 保存 d_C 回传的结果
cudaMalloc((void **) &d_A, mem_size_A);
cudaMalloc((void **) &d_B, mem_size_B);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_C, mem_size_C); dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); printf("Computing result using CUBLAS...");
int nIter = ; //cuBLAS代码块
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop; cublasCreate(&handle); //热身,注意转置的问题,不采用 <<< >>> 调用核函数
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord(start, NULL); for (int j = ; j < nIter; j++)
{
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
}
printf("done.\n");
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop); // 计算了耗时、操作数以及操作速度
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
cublasDestroy(handle);
} printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n"); bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true)
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, , 1.0e-5f); printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); free(h_A);
free(h_B);
free(h_CUBLAS);
free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C); if (resCUBLAS == true)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n"); int devID = , sizeMult = ;
sMatrixSize matrix_size; initializeCUDA(argc, argv, devID, sizeMult, matrix_size); int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); getchar();
return matrix_result;
}

▶ 输出结果:

[Matrix Multiply CUBLAS] - Starting...
GPU Device : "GeForce GTX 1070" with compute capability 6.1 MatrixA(,), MatrixB(,), MatrixC(,)
Computing result using CUBLAS...done.
Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势:

● 代码依然很烂。
多用了一个 h_C 根本没有用上,其作用被 h_CUBLAS 取代了,而且源代码中有free(h_C)却没有free(h_CUBLAS)。

 float *h_C = (float *)malloc(mem_size_C);
...
free(h_C);

● 句柄的创造与销毁,定义于cublas_api.h 中

 /*cublas_api.h*/
typedef struct cublasContext *cublasHandle_t; /*cublas_v2.h*/
#define cublasCreate cublasCreate_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle); #define cublasDestroy cublasDestroy_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);

● 矩阵乘法计算核心函数。实际上该函数不是用来专门计算矩阵乘法的,而且对应不同的数据类型(实数、复数)和数据精度(单精度、双精度)一共有四个函数。

 #define cublasSgemm cublasSgemm_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2
(
cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc
);

● 定义在 helper_image.h 中的一个函数,用于比较两个长为 len 数组是否相等,允许容差为epsilon

 inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)

● 调用cublas计算矩阵乘法的过程摘要(详细的参数说明和调用规则见另一篇随笔)

 ...// 准备d_A,d_B,d_C,其中 d_A 和 d_B 中存放了需要相乘的两个矩阵,d_C初始化自动为零矩阵

 // 规定使用的线程块和线程尺寸
dim3 threads(, );
dim3 grid(, ); // 常数因子,计算 d_C = d_A * d_B 时设定为 α = 1.0, β = 0.0
const float alpha = 1.0f;
const float beta = 0.0f; // 创建句柄,需要在计算完成后销毁
cublasHandle_t handle;
cublasCreate(&handle); // 调用计算函数,注意参数顺序
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB); cublasDestroy(handle); ...// 回收计算结果,顺序可以和销毁句柄交换

0_Simple__matrixMulCUBLAS的更多相关文章

随机推荐

  1. U方法

    U方法用于完成对URL地址的组装,特点在于可以自动根据当前的URL模式和设置生成对应的URL地址,格式为:U('地址','参数','伪静态','是否跳转','显示域名');在模板中使用U方法而不是固定 ...

  2. Spring常用注解介绍【经典总结】

    Spring的一个核心功能是IOC,就是将Bean初始化加载到容器中,Bean是如何加载到容器的,可以使用Spring注解方式或者Spring XML配置方式. Spring注解方式减少了配置文件内容 ...

  3. Qt 打开文件的默认路径 QFileDialog::getOpenFileName()

    为了说明QFileDialog::getOpenFileName()函数的用法,还是先把函数签名放在这里:   QString QFileDialog::getOpenFileName (       ...

  4. SiganlR 系列之概述

    简介 SignalR 是微软的 http 长连接(以下简称长连接)框架,它的出现为我们提供了一套行之有效的实时通信的解决方案. 背景 在http 1.0 时代,preRequest 都会建立新的tcp ...

  5. hdu 4123--Bob’s Race(树形DP+RMQ)

    题目链接 Problem Description Bob wants to hold a race to encourage people to do sports. He has got troub ...

  6. Spring REST 与 Zuul 代理

    http://www.baeldung.com/spring-rest-with-zuul-proxy 作者: Eugen Paraschiv 译者: http://oopsguy.com 1.概述 ...

  7. 【Spring】面向切面之AOP

    前言 前面已经讲解了bean的装配技术,接着学习Spring中另外一个核心概念:切面. 面向切面 面向切面编程 切面能够帮助模块化横切关注点,横切关注点可以被描述为影响应用的功能,如为业务添加安全和事 ...

  8. (转)simhash算法原理及实现

    simhash是google用来处理海量文本去重的算法. google出品,你懂的. simhash最牛逼的一点就是将一个文档,最后转换成一个64位的字节,暂且称之为特征字,然后判断重复只需要判断他们 ...

  9. Python协程爬取妹子图(内有福利,你懂得~)

    项目说明: 1.项目介绍   本项目使用Python提供的协程+scrapy中的选择器的使用(相当好用)实现爬取妹子图的(福利图)图片,这个学会了,某榴什么的.pow(2, 10)是吧! 2.用到的知 ...

  10. javascript(js)创建对象的模式与继承的几种方式

    1.js创建对象的几种方式 工厂模式 为什么会产生工厂模式,原因是使用同一个接口创建很多对象,会产生大量的重复代码,为了解决这个问题,产生了工厂模式. function createPerson(na ...