2.3 CUDA Matrix Multiplication
CPU Matrix Multiplication
For two matrices to be multiplied, the number of columns of the first matrix must equal the number of rows of the second.
A(N*P) * B(P*M) = C(N*M), where A has N rows and P columns and B has P rows and M columns. In width/height terms: the width of A equals the height of B, and the result C has the height of A (ha) and the width of B (wb).
// C = A*B
void MatrixMulCPU(float* _C, const float* _A, const float* _B, int _wa, int _ha, int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
                // i*_wa selects the current row of A; +k selects the column within that row. A's stride is _wa.
                // j selects the current column of B; +k*_wb walks down that column (row 0, row 1, ..., row _wa-1). B's stride is _wb.
                sum += _A[i*_wa + k] * _B[k*_wb + j];
            }
            _C[i*_wb + j] = sum;
        }
    }
}
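To make the row-major indexing concrete, here is a minimal driver for the function above; the matrix values are made-up illustration data:

#include <stdio.h>

int main(void)
{
    // A is 2x3 (_ha=2, _wa=3), B is 3x2 (_wb=2), so C is 2x2.
    float A[6] = {1, 2, 3,
                  4, 5, 6};
    float B[6] = {1, 0,
                  0, 1,
                  1, 1};
    float C[4] = {0};

    MatrixMulCPU(C, A, B, 3, 2, 2); // _wa=3, _ha=2, _wb=2

    for (int i = 0; i < 2; ++i)
        printf("%.0f %.0f\n", C[i*2 + 0], C[i*2 + 1]); // expects: 4 5 / 10 11
    return 0;
}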
Simple matrix multiplication
How should we partition the work? The first idea is to have each thread compute one element C(i,j); a quick estimate says this needs height_c*width_c, i.e. ha*wb, threads. Going further, we carve the matrix into square blocks: with a 16*16 block, an 80*48 matrix can be expressed as 5(*16) * 3(*16), i.e. a grid of 5*3 blocks, so the grid is naturally partitioned into (height_c/16) * (width_c/16) blocks.
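As a fragment sketching that arithmetic (the 16x16 block and the 80x48 size come from the example above; both dimensions are assumed to be exact multiples of 16):

const int BLOCK = 16;                          // hypothetical block edge
const int height_c = 80, width_c = 48;

dim3 block(BLOCK, BLOCK);                      // 16*16 = 256 threads per block
dim3 grid(width_c / BLOCK, height_c / BLOCK);  // grid.x = 3, grid.y = 5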
Kernel version 0:
__global__ void matrix_kernel_0(float* _C, const float* _A, const float* _B, int _wa, int _wb)
{
    float sum = 0;
    // Find the row and column this thread is responsible for:
    int row = blockIdx.y*blockDim.y + threadIdx.y; // the Y dimension maps to the matrix row
    int col = blockIdx.x*blockDim.x + threadIdx.x; // the X dimension maps to the matrix column; Thread(row,col) computes C(row,col)
    for (int i = 0; i < _wa; ++i)
    {
        sum += _A[row*_wa + i] * _B[i*_wb + col];
    }
    _C[row*_wb + col] = sum;
}
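A minimal host-side sketch of how this kernel might be launched (the wrapper name and the 16x16 block are assumptions; since the kernel has no bounds check, ha and wb must be exact multiples of the block size):

// Hypothetical host wrapper; error checking omitted for brevity.
void matrixMulGPU(float* C, const float* A, const float* B,
                  int wa, int ha, int wb) // ha, wb assumed multiples of 16
{
    float *dA, *dB, *dC;
    cudaMalloc(&dA, ha * wa * sizeof(float));
    cudaMalloc(&dB, wa * wb * sizeof(float));
    cudaMalloc(&dC, ha * wb * sizeof(float));
    cudaMemcpy(dA, A, ha * wa * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B, wa * wb * sizeof(float), cudaMemcpyHostToDevice);

    dim3 block(16, 16);
    dim3 grid(wb / 16, ha / 16); // x covers the columns of C, y the rows
    matrix_kernel_0<<<grid, block>>>(dC, dA, dB, wa, wb);

    cudaMemcpy(C, dC, ha * wb * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
}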
Here is the code from Heterogeneous Parallel Programming lab3: Basic Matrix Matrix Multiplication:
#include <wb.h>

#define wbCheck(stmt)                                               \
  do {                                                              \
    cudaError_t err = stmt;                                         \
    if (err != cudaSuccess) {                                       \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                   \
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
      return -1;                                                    \
    }                                                               \
  } while (0)

#if 0 // This is the C version of matrix multiplication
// C = A*B
void MatrixMulCPU(float* _C, const float* _A, const float* _B, int _wa, int _ha, int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
                sum += _A[i*_wa + k] * _B[k*_wb + j];
            }
            _C[i*_wb + j] = sum;
        }
    }
}
#endif

// Compute C = A * B , Matrix C = hA * wB = rowA * columnB
__global__ void matrixMultiply(float *A, float *B, float *C, int numARows,
                               int numAColumns, int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
  float sum = 0.0f;
  int row = blockIdx.y*blockDim.y + threadIdx.y;
  int col = blockIdx.x*blockDim.x + threadIdx.x;
  if (row < numCRows && col < numCColumns) {
    for (int i = 0; i < numAColumns; ++i)
    {
      sum += A[row*numAColumns + i] * B[i*numBColumns + col];
    }
    C[row*numBColumns + col] = sum;
    printf("C = %f\n", C[row*numBColumns + col]); // debug output; kept inside the bounds check to avoid out-of-range reads
  }
}

int main(int argc, char **argv) {
wbArg_t args;
float *hostA; // The A matrix
float *hostB; // The B matrix
float *hostC; // The output C matrix
float *deviceA;
float *deviceB;
float *deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)

args = wbArg_read(argc, argv);

wbTime_start(Generic, "Importing data and creating memory on host");
hostA =
    ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB =
    ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
if (numAColumns != numBRows) {
    wbLog(TRACE, "numAColumns != numBRows, Break ");
    return -1;
}
numCRows = numARows;
numCColumns = numBColumns;
unsigned int A_size = numARows * numAColumns * sizeof(float);
unsigned int B_size = numBRows * numBColumns * sizeof(float);
unsigned int C_size = numCRows * numCColumns * sizeof(float);
//@@ Allocate the hostC matrix
hostC = ( float * )malloc(C_size);
wbTime_stop(Generic, "Importing data and creating memory on host");

wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
wbCheck(cudaMalloc((void**)&deviceA, A_size));
wbCheck(cudaMalloc((void**)&deviceB, B_size));
wbCheck(cudaMalloc((void**)&deviceC, C_size));
wbTime_stop(GPU, "Allocating GPU memory.");

wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");

//@@ Initialize the grid and block dimensions here
dim3 DimGrid((numCColumns-1)/16+1, (numCRows-1)/16+1, 1);
dim3 DimBlock(16, 16, 1);

wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiply<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");

wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));

wbTime_stop(Copy, "Copying output memory to the CPU");

wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
wbCheck(cudaFree(deviceA));
wbCheck(cudaFree(deviceB));
wbCheck(cudaFree(deviceC));

wbTime_stop(GPU, "Freeing GPU Memory");

wbSolution(args, hostC, numCRows, numCColumns);

free(hostA);
free(hostB);
free(hostC);

return 0;
}
Partitioning matrix multiplication with tiles
Another approach: instead of having each thread compute a full C(i,j), note from C(i,j) = sum { A(i,k)*B(k,j) } that the sum can be split into finer-grained pieces:
Csub(i,j) = sum{ A(i, ksub+offsetA) * B(ksub+offsetB, j) },  0 <= ksub < blockSize
C(i,j) = sum{ Csub(i,j) }
That is, the matrices are carved into square sub-blocks, and each block computes the partial product of sub-block i of A and sub-block j of B; the partial sums are then accumulated. The main point of this scheme is to use shared memory to improve efficiency.
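Spelled out as a single formula (a restatement of the two lines above, writing $b$ for blockSize and $w_A$ for _wa, with $w_A$ assumed to be a multiple of $b$):

$$C(i,j) \;=\; \sum_{m=0}^{w_A/b - 1} \underbrace{\sum_{k=0}^{b-1} A(i,\, mb+k)\, B(mb+k,\, j)}_{Csub_m(i,j)}$$

Each block computes the inner sums out of shared memory; the outer sum over the tile index m is the loop over a and b in the kernel below.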
Kernel version 1 (the tiled kernel):
__global__ void matrix_kernel_1(float* _C, const float* _A, const float* _B, int _wa, int _wb) // _wa is the width of A, _wb the width of B
{
    int bx = blockIdx.x;  // this block's position in X
    int by = blockIdx.y;  // this block's position in Y
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // The part of A this block walks through runs along the X axis; the part of B runs along the Y axis.
    int aBegin = _wa*(by*BLOCK_SIZE); // A(0,by): first address of this block's row band in A
    int aEnd = aBegin + _wa - 1;
    int aStep = BLOCK_SIZE;           // offsetA: A is traversed horizontally, so the step is BLOCK_SIZE
    int bBegin = BLOCK_SIZE*bx;       // B(bx,0): first address of this block's column band in B
    int bStep = BLOCK_SIZE*_wb;       // offsetB: B is traversed vertically, so the step is BLOCK_SIZE*_wb

    float cSub = 0;
    // Each thread computes one output element in _wa/BLOCK_SIZE passes; each pass computes one A(sub)*B(sub) segment, and the accumulated total is C's value.
    // For n*n matrices, the basic kernel above does 2n global-memory accesses per thread; with shared memory this drops to 2n/blocksize, a blocksize-fold improvement per thread.
    // All threads in a block then compute out of shared memory, which is very fast.
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Each thread copies one element; viewed per block (say blocksize=16, i.e. 16*16 threads),
        // the threads together fill the needed As and Bs tiles: the 16*16 patch of A is staged in shared memory, and likewise for B.
        As[ty][tx] = _A[a + _wa*ty + tx];
        Bs[ty][tx] = _B[b + _wb*ty + tx];
        __syncthreads(); // ensure the current tile*tile patches of A and B are fully staged in As and Bs before the computation below uses them

        // Each thread accumulates the sub-product of sub-block i and sub-block j over a width of BLOCK_SIZE;
        // after _wa/BLOCK_SIZE passes the accumulated sum is C's value.
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            cSub += As[ty][k]*Bs[k][tx];
        }
        __syncthreads();
    }

    // Write back to global memory: one thread per element, one block per sub-block.
    int cIndex = (by*BLOCK_SIZE + ty)*_wb + (bx*BLOCK_SIZE + tx);
    _C[cIndex] = cSub;
}
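A launch sketch for this kernel under the same assumptions (BLOCK_SIZE defined as, say, 16; dA, dB, dC are device buffers prepared as in the earlier host sketch; all dimensions are exact multiples of BLOCK_SIZE, since the kernel has no boundary handling):

// Hypothetical launcher; one block per BLOCK_SIZE x BLOCK_SIZE tile of C.
void launchTiled(float* dC, const float* dA, const float* dB,
                 int wa, int ha, int wb)
{
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(wb / BLOCK_SIZE, ha / BLOCK_SIZE);
    matrix_kernel_1<<<grid, block>>>(dC, dA, dB, wa, wb);
    cudaDeviceSynchronize();
}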
Indexing a 2D matrix:
Suppose we have a 32*48 matrix (32 rows, 48 columns), so x ranges over [0,47] and y over [0,31]. In the code it is stored as a 2D array in contiguous memory. We want the index of point(23,23):
The flat pointer offset is: 23*48 + 23.
But we can also partition this matrix with a grid and blocks:
grid(3,2) (3 is the number of block columns, i.e. the X dimension; 2 is the number of block rows, the Y dimension) and block(16,16). Grid X ranges over [0,2] and Y over [0,1]; the same reasoning applies within a block. We can then locate point(23,23) as follows:
point(23,23) falls in the block at grid coordinates (1,1), at thread coordinates (7,7) within the block; the block width is 16.
point.y = (by*Block_size + ty) = 1*16+7 = 23
point.x = (bx*Block_size + tx) = 1*16+7 = 23
The flat index of point(x,y) is: y * width + x
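A tiny standalone check that the two ways of computing the index agree for point(23,23) (values taken from the example above):

#include <assert.h>

int main(void)
{
    const int width = 48, block_size = 16;
    int x = 23, y = 23;

    // Direct row-major index.
    int flat = y * width + x;                      // 23*48 + 23 = 1127

    // Reconstruct the same index from grid/block coordinates.
    int bx = x / block_size, tx = x % block_size;  // (1, 7)
    int by = y / block_size, ty = y % block_size;  // (1, 7)
    int flat2 = (by*block_size + ty) * width + (bx*block_size + tx);

    assert(flat == flat2);
    return 0;
}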
Here is the code from Heterogeneous Parallel Programming lab4: Tiled Matrix Matrix Multiplication:
#include <wb.h>

#define wbCheck(stmt)                                               \
  do {                                                              \
    cudaError_t err = stmt;                                         \
    if (err != cudaSuccess) {                                       \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                   \
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
      return -1;                                                    \
    }                                                               \
  } while (0)

#define TILE_WIDTH 32 // block size; each thread computes one element per tile

// Compute C = A * B
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
                                     int numAColumns, int numBRows,
                                     int numBColumns, int numCRows,
                                     int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
  //@@ You have to use shared memory for this MP
  __shared__ float sharedM[TILE_WIDTH][TILE_WIDTH];
  __shared__ float sharedN[TILE_WIDTH][TILE_WIDTH];
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int row = by*TILE_WIDTH + ty;
  int col = bx*TILE_WIDTH + tx;
  float v = 0.0;

  for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
  {
    // Load one tile of A and one tile of B, padding with zeros past the matrix edges.
    if (i*TILE_WIDTH + tx < numAColumns && row < numARows)
      sharedM[ty][tx] = A[row*numAColumns + i*TILE_WIDTH + tx];
    else
      sharedM[ty][tx] = 0.0;

    if (i*TILE_WIDTH + ty < numBRows && col < numBColumns)
      sharedN[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + col];
    else
      sharedN[ty][tx] = 0.0;
    __syncthreads();

    for (int j = 0; j < TILE_WIDTH; j++)
      v += sharedM[ty][j] * sharedN[j][tx];
    __syncthreads();
  }

  if (row < numCRows && col < numCColumns)
    C[row*numCColumns + col] = v;
}

int main(int argc, char **argv) {
wbArg_t args;
float *hostA; // The A matrix
float *hostB; // The B matrix
float *hostC; // The output C matrix
float *deviceA;
float *deviceB;
float *deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)

args = wbArg_read(argc, argv);

wbTime_start(Generic, "Importing data and creating memory on host");
hostA =
    ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB =
    ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = 0;
numCColumns = 0;
if (numAColumns != numBRows) {
    wbLog(TRACE, "numAColumns != numBRows, Break ");
    return -1;
}
numCRows = numARows;
numCColumns = numBColumns;
unsigned int A_size = numARows * numAColumns * sizeof(float);
unsigned int B_size = numBRows * numBColumns * sizeof(float);
unsigned int C_size = numCRows * numCColumns * sizeof(float);
//@@ Allocate the hostC matrix
hostC = ( float * )malloc(C_size);
wbTime_stop(Generic, "Importing data and creating memory on host");

wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
wbCheck(cudaMalloc((void**)&deviceA, A_size));
wbCheck(cudaMalloc((void**)&deviceB, B_size));
wbCheck(cudaMalloc((void**)&deviceC, C_size));
wbTime_stop(GPU, "Allocating GPU memory.");

wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");

//@@ Initialize the grid and block dimensions here
dim3 DimGrid(ceil(numCColumns / 32.0), ceil(numCRows / 32.0), 1);
dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);

wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiplyShared<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");

wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));

wbTime_stop(Copy, "Copying output memory to the CPU");

wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
wbCheck(cudaFree(deviceA));
wbCheck(cudaFree(deviceB));
wbCheck(cudaFree(deviceC));

wbTime_stop(GPU, "Freeing GPU Memory");

wbSolution(args, hostC, numCRows, numCColumns);

free(hostA);
free(hostB);
free(hostC);

return 0;
}