使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则,关于乘法函数中详细的参数说明和调用规则见另一篇随笔。

▶ 源代码:

 #include <assert.h>
#include <helper_string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_functions.h>
#include <helper_cuda.h> #ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif // 存放各矩阵维数的结构体
typedef struct _matrixSize
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize; // CPU 计算矩阵乘法。三个参数分别用于行定位、行程定位、列定位,没有查错机制。
void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = ; i < hA; ++i) // 从上往下数 i 行
{
for (unsigned int j = ; j < wB; ++j) // 从左往右数 j 列
{
double sum = ;
for (unsigned int k = ; k < wA; ++k) // 行程长
{
double a = A[i * wA + k]; // 中间过程用 double,结果输出 float
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
} // 初始化数组
void randomInit(float *data, int size)
{
for (int i = ; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
} //输出两个矩阵的不相等的值及其位置,允许容差为 fListTol ,最多输出 iListLength 个
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
int i, j, k;
int error_count = ; for (j = ; j < height; j++)
{
if (error_count < iListLength)
printf("\n Row %d:\n", j); for (i = ; i < width; i++)
{
k = j * width + i;
float fDiff = fabs(data1[k] - data2[k]);
if (fDiff > fListTol)
{
if (error_count < iListLength)
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
error_count++;
}
}
}
printf(" \n Total Errors = %d\n", error_count);
} // 初始化设备。包括选择设备,设定维数
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
cudaError_t error; // 选择设备,略去了检查错误部分
devID = ;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
}
error = cudaGetDevice(&devID);
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
iSizeMultiple = max(min(iSizeMultiple, ), );
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // Fermi 以上构架的计算机使用更大的线程块(blockDim),这里用了32
int block_size = (deviceProp.major < ) ? : ; matrix_size.uiWA = * block_size * iSizeMultiple;
matrix_size.uiHA = * block_size * iSizeMultiple;
matrix_size.uiWB = * block_size * iSizeMultiple;
matrix_size.uiHB = * block_size * iSizeMultiple;
matrix_size.uiWC = * block_size * iSizeMultiple;
matrix_size.uiHC = * block_size * iSizeMultiple; printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC);
if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-);
}
} // 计算矩阵乘法部分
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, devID);
int block_size = (deviceProp.major < ) ? : ; unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B); srand();
randomInit(h_A, size_A);
randomInit(h_B, size_B); float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
//float *h_C = (float *) malloc(mem_size_C); // TM 没有用!
float *h_CUBLAS = (float *) malloc(mem_size_C); // 保存 d_C 回传的结果
cudaMalloc((void **) &d_A, mem_size_A);
cudaMalloc((void **) &d_B, mem_size_B);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_C, mem_size_C); dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); printf("Computing result using CUBLAS...");
int nIter = ; //cuBLAS代码块
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop; cublasCreate(&handle); //热身,注意转置的问题,不采用 <<< >>> 调用核函数
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); cudaEventCreate(&start);
cudaEventCreate(&stop); cudaEventRecord(start, NULL); for (int j = ; j < nIter; j++)
{
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
&alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
}
printf("done.\n");
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop);
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop); // 计算了耗时、操作数以及操作速度
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
cublasDestroy(handle);
} printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n"); bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true)
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, , 1.0e-5f); printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); free(h_A);
free(h_B);
free(h_CUBLAS);
free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C); if (resCUBLAS == true)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n"); int devID = , sizeMult = ;
sMatrixSize matrix_size; initializeCUDA(argc, argv, devID, sizeMult, matrix_size); int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); getchar();
return matrix_result;
}

▶ 输出结果:

[Matrix Multiply CUBLAS] - Starting...
GPU Device : "GeForce GTX 1070" with compute capability 6.1 MatrixA(,), MatrixB(,), MatrixC(,)
Computing result using CUBLAS...done.
Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势:

● 代码依然很烂。
多用了一个 h_C 根本没有用上,其作用被 h_CUBLAS 取代了,而且源代码中有free(h_C)却没有free(h_CUBLAS)。

 float *h_C = (float *)malloc(mem_size_C);
...
free(h_C);

● 句柄的创造与销毁,定义于cublas_api.h 中

 /*cublas_api.h*/
typedef struct cublasContext *cublasHandle_t; /*cublas_v2.h*/
#define cublasCreate cublasCreate_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle); #define cublasDestroy cublasDestroy_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);

● 矩阵乘法计算核心函数。实际上该函数不是用来专门计算矩阵乘法的,而且对应不同的数据类型(实数、复数)和数据精度(单精度、双精度)一共有四个函数。

 #define cublasSgemm cublasSgemm_v2
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2
(
cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc
);

● 定义在 helper_image.h 中的一个函数,用于比较两个长为 len 数组是否相等,允许容差为epsilon

 inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)

● 调用cublas计算矩阵乘法的过程摘要(详细的参数说明和调用规则见另一篇随笔)

 ...// 准备d_A,d_B,d_C,其中 d_A 和 d_B 中存放了需要相乘的两个矩阵,d_C初始化自动为零矩阵

 // 规定使用的线程块和线程尺寸
dim3 threads(, );
dim3 grid(, ); // 常数因子,计算 d_C = d_A * d_B 时设定为 α = 1.0, β = 0.0
const float alpha = 1.0f;
const float beta = 0.0f; // 创建句柄,需要在计算完成后销毁
cublasHandle_t handle;
cublasCreate(&handle); // 调用计算函数,注意参数顺序
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB); cublasDestroy(handle); ...// 回收计算结果,顺序可以和销毁句柄交换

0_Simple__matrixMulCUBLAS的更多相关文章

随机推荐

  1. ES6 Promise 对象

    Promise 的含义 Promise 是异步编程的一种解决方案,比传统的解决方案--回调函数和事件--更合理和更强大.它由社区最早提出和实现,ES6 将其写进了语言标准,统一了用法,原生提供了Pro ...

  2. java文档操作

    背景:因是动态报表,1)作成excel模版2)数据填充3)转化为PDF提出解决方法:[open source]1)Apache Poi+I text2) JodConvert+OpenOffice/l ...

  3. Python 编程基础之高阶函数篇(一)

      高阶函数:能接受函数作为参数的函数. 如: f=abs def   add(x,y,f): return f(x)+f(y) 如果我们用:add(-5,9,f)来调用该高阶函数,则返回结果为:14 ...

  4. CodeForces 242E二维线段树

                                                                                           E. XOR on Seg ...

  5. 考了3年,工作四年,零基础在职终于拿到CFA证书

    大家都知道CFA Charterholder是独有的全球公认的投资管理从业人员高职业水平和道德水准的有力证明,是金融界卓越专业成就的象征:CFA资格强调和遵循极其严格的职业操守和道德准则,世界各主要发 ...

  6. 计蒜客 2017 NOIP 提高组模拟赛(四)Day1 T2 小X的密室

    https://nanti.jisuanke.com/t/17323 小 X 正困在一个密室里,他希望尽快逃出密室. 密室中有 N 个房间,初始时,小 X 在 1号房间,而出口在 N号房间. 密室的每 ...

  7. HDU3605 Escape

    思想:缩点+sap Max,t还可以缩小,优化,高数课写的,有点丑,暂时懒得改. #include<cstdio> #include<cstdlib> #include< ...

  8. java程序调用存储过程和存储函数

    java程序调用存储过程 jdbcUtil.java文件 package cn.itcast.oracle.utils; import java.sql.Connection; import java ...

  9. dom4j之小小工具

    dom4j经常不用,方法忘了又记,故做出读取xml和把document写入xml的小小工具~~~ /** * 读取document和将document对象写入到xml的小工具 * 使用该类必须给出do ...

  10. TableView的性能优化

    现在市场上的iOS应用程序界面中使用最多的UI控件是什么? 答案肯定是UITableView,几乎每一款App都有很多的界面是由UITableView实现的,所以为了做出一款优秀的App,让用户有更好 ...