▶ 目前为止能跑的所有代码及其结果(2019年2月24日),之后添加:DIA 乘法 GPU 版;其他维度的乘法(矩阵乘矩阵);其他稀疏矩阵格式之间的相互转化

 #include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <malloc.h>
#include <math.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h" #define ROW (8192)
#define COL (8192)
#define RATIO 0.1 // 系数矩阵非零元素占比
#define EPSILON 0.0001 // 浮点数比较
#define THREAD_SIZE 256
#define SEED 1 // (unsigned int)time(NULL)
#define MAX(x,y) (((x)>(y))?(x):(y))
#define CEIL(x,y) (int)(( x - 1 ) / y + 1)
#define INT // 数字格式用 int 还是 float #ifdef INT
typedef int format;
#else
typedef float format;
#endif // 矩阵存储格式
typedef struct // 顺序格式
{
int row; // 行数
int col; // 列数
int count; // 非零元个数(用于转换,不用于计算)
format *data; // 元素的值
}
MAT; typedef struct // Compressed Sparse Row 格式
{
int row; // 行数
int col; // 列数
format *data; // 元素的值
int *index; // 元素的列号
int *ptr; // 每行首元在 index 中的下标,最后一个元素的值等于矩阵非零元个数
}
CSR; typedef struct // Compressed Sparse Row 格式
{
int row; // 行数
int col; // 列数
format *data; // 元素的值
int *index; // 元素的行号
int *ptr; // 每列首元在 index 中的下标,最后一个元素的值等于矩阵非零元个数
}
CSC; typedef struct // ELLPACK 格式
{
int row; // 行数
int col; // 列数,相当于MAT格式的行数
int colOrigin; // 原列数,相当于MAT格式的列数
format *data; // 元素的值
int *index; // 元素在MAT格式中的列号
}
ELL; typedef struct // Coordinate 格式
{
int row; // 行数
int col; // 列数
int count; // 非零元个数
int *rowIndex; // 行向量
int *colIndex; // 列向量
format *data; // 元素的值
}
COO; typedef struct // Diagonal 格式
{
int row; // 行数
int col; // 列数
int colOrigin; // 原列数
format *data; // 元素的值
int *index; // 原矩阵各对角线是否非零
}
DIA; // 全局指针
__managed__ MAT *aMAT, *xMAT, *yMATRef, *yMATCal;
__managed__ CSR *aCSR;
__managed__ ELL *aELL;
__managed__ COO *aCOO; // 通用函数
__host__ __device__ inline void checkNULL(const void *input) // 有点问题,设备函数无法使用 exit(1) 来推出
{
if (input == NULL)
printf("\n\tfind a NULL!");
return;
} __host__ inline void checkCudaError(cudaError input)
{
if (input != cudaSuccess)
{
printf("\n\tfind a cudaError!");
exit();
}
return;
} int checkEqual(const format * in1, const format * in2, const int length)// 注意两向量相等时返 0,否则返回“值不等的元素下标加一”
{
int i;
for (i = ; i < length; i++)
{
#ifdef INT
if (in1[i] != in2[i])
#else
if (fabs(in1[i] - in2[i]) > EPSILON)
#endif
{
printf("\n\tError at i = %d\n\tin1[i] = %10f, in2[i] = %10f\n", i, (float)in1[i], (float)in2[i]);
return i + ;
}
}
printf("\n\tCheck output, passed.\n");
return ;
} // 打印矩阵
void print(const char* info, const MAT *in)// 打印MAT格式的矩阵
{
printf("%s\n\tMAT,\n\trow = %d, col = %d, count = %d", info, in->row, in->col, in->count);
printf("\n\tdata =\n\t");
for (int i = ; i < in->row * in->col; i++)
{
#ifdef INT
printf("%d ", in->data[i]);
#else
printf("%.2f ", in->data[i]);
#endif
if ((i + ) % in->col == )
printf("\n\t");
}
printf("\n");
return;
} void print(const char* info, const CSR *in)// 打印CSR格式的矩阵
{
printf("%s\n\tCSR,\n\trow = %d, col = %d", info, in->row, in->col);
printf("\n\tdata =\n\t");
for (int i = ; i < in->ptr[in->row]; i++)
#ifdef INT
printf("%d ", in->data[i]);
#else
printf("%.2f ", in->data[i]);
#endif
printf("\n\tindex =\n\t");
for (int i = ; i < in->ptr[in->row]; i++)
printf("%d ", in->index[i]);
printf("\n\tptr =\n\t");
for (int i = ; i < in->row + ; i++)
printf("%d ", in->ptr[i]);
printf("\n\n");
return;
} void print(const char* info, const ELL *in)// 打印ELL格式的矩阵
{
int i;
printf("%s\n\tELL,\n\trow = %d, col = %d, colOrigin = %d", info, in->row, in->col, in->colOrigin);
printf("\n\tdata =\n\t");
for (i = ; i < in->row * in->col; i++)
{
#ifdef INT
printf("%d ", in->data[i]);
#else
printf("%.2f ", in->data[i]);
#endif
if ((i + ) % in->col == )
printf("\n\t");
}
printf("index =\n\t");
for (i = ; i < in->row * in->col; i++)
{
printf("%d ", in->index[i]);
if ((i + ) % in->col == )
printf("\n\t");
}
printf("\n\n");
return;
} void print(const char* info, const COO *in)// 打印COO格式的矩阵
{
int i;
printf("%s\n\tCOO,\n\trow = %d, col = %d, count = %d", info, in->row, in->col, in->count);
printf("\n\t(rowIndex, colIndex, data) =\n\t");
for (i = ; i < in->count; i++)
{
#ifdef INT
printf("(%d,%d,%d)\n\t", in->rowIndex[i], in->colIndex[i], in->data[i]);
#else
printf("(%d,%d,%.2f)\n\t", in->rowIndex[i], in->colIndex[i], in->data[i]);
#endif
}
printf("\n\n");
return;
} void print(const char* info, const DIA *in)// 打印DIA格式的矩阵
{
int i;
printf("%s\n\tDIA,\n\trow = %d, col = %d, colOrigin = %d", info, in->row, in->col, in->colOrigin,in->colOrigin);
printf("\n\tdata =\n\t");
int * inverseIndex = (int *)malloc(sizeof(int) * in->col);
for (int i = , j = ; i < in->row + in->col - ; i++)
{
if (in->index[i] == )
{
inverseIndex[j] = i;
j++;
}
}
for (i = ; i < in->row * in->col; i++)
{
if (i / in->col < in->row - - inverseIndex[i % in->col] || i / in->col > inverseIndex[in->col - ] - inverseIndex[i % in->col])
printf("* ");
else
#ifdef INT
printf("%d ", in->data[i]);
#else
printf("%.2f ", in->data[i]);
#endif
if ((i + ) % in->col == )
printf("\n\t");
}
printf("index =\n\t");
for (i = ; i < in->row + in->col - ; i++)
printf("%d ", in->index[i]);
printf("\n\n");
free(inverseIndex);
return;
} // 矩阵初始化与清理
MAT *initializeMAT(const int row, const int col, const int count = )// 初始化MAT,注意非零元默认为 0
{
MAT *temp;
checkCudaError(cudaMallocManaged((void **)&temp, sizeof(MAT)));
temp->row = row;
temp->col = col;
temp->count = count;
checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col));
return temp;
} // 统计MAT形式的矩阵中的非零元素个数
#define COUNT_MAT(in) \
{ \
checkNULL(in); \
int i, zero; \
for (i = zero = ; i < in->row * in->col; i++) \
zero += !in->data[i]; \
in->count = in->row * in->col - zero; \
} int freeMatrix(MAT *in)// 释放MAT
{
checkNULL(in);
cudaFree(in->data);
cudaFree(in);
return ;
} CSR * initializeCSR(const int row, const int col, const int count)// 初始化CSR,要求给出 count
{
CSR *temp;
checkCudaError(cudaMallocManaged((void **)&temp, sizeof(CSR)));
temp->row = row;
temp->col = col;
checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * count));
checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(int) * count));
checkCudaError(cudaMallocManaged((void **)&temp->ptr, sizeof(int) * (row + )));
return temp;
} int freeMatrix(CSR *in)// 释放CSR
{
checkNULL(in);
cudaFree(in->data);
cudaFree(in->index);
cudaFree(in->ptr);
cudaFree(in);
return ;
} ELL * initializeELL(const int row, const int col, const int colOrigin)// 初始化ELL,要求给出原列数
{
ELL *temp;
checkCudaError(cudaMallocManaged((void **)&temp, sizeof(ELL)));
temp->row = row;
temp->col = col;
temp->colOrigin = colOrigin;
checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col));
checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(int) * row * col));
return temp;
} int freeMatrix(ELL *in)// 释放ELL
{
cudaFree(in->data);
cudaFree(in->index);
cudaFree(in);
return ;
} COO * initializeCOO(const int row, const int col, const int count)// 初始化COO,要求给出 count
{
COO * temp;
checkCudaError(cudaMallocManaged((void **)&temp, sizeof(COO)));
temp->row = row;
temp->col = col;
temp->count = count;
checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * count));
checkCudaError(cudaMallocManaged((void **)&temp->rowIndex, sizeof(int) * count));
checkCudaError(cudaMallocManaged((void **)&temp->colIndex, sizeof(int) * count));
return temp;
} int freeMatrix(COO *in)// 释放COO
{
checkNULL(in);
cudaFree(in->data);
cudaFree(in->rowIndex);
cudaFree(in->colIndex);
cudaFree(in);
return ;
} DIA * initializeDIA(const int row, const int col, const int colOrigin)// 初始化DIA,要求给出原行数、原列数
{
DIA * temp;
checkCudaError(cudaMallocManaged((void **)&temp, sizeof(DIA)));
temp->row = row;
temp->col = col;
temp->colOrigin = colOrigin;
checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col));
checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(format) * (row + col - )));
return temp;
} int freeMatrix(DIA *in)// 释放DIA
{
checkNULL(in);
cudaFree(in->data);
cudaFree(in->index);
cudaFree(in);
return ;
} // 矩阵格式的相互转化
CSR * MATToCSR(const MAT *in) // MAT 转 CSR
{
checkNULL(in);
CSR * out = initializeCSR(in->row, in->col, in->count);
checkNULL(out); out->ptr[] = ;
for (int i = , j = , k = ; i < in->row * in->col; i++) // i 遍历 in->data
{
if (in->data[i] != ) // 找到非零元
{
if (j == in->count) // 在 out->data 已经填满了的基础上又发现了非零元,错误
return NULL;
out->data[j] = in->data[i]; // 填充非零元素
out->index[j] = i % in->col; // 填充列号
j++;
}
if ((i + ) % in->col == ) // 到了最后一列,写入行指针号
out->ptr[k++] = j;
}
return out;
} MAT * CSRToMAT(const CSR *in) // CSR转MAT
{
checkNULL(in);
MAT *out = initializeMAT(in->row, in->col, in->ptr[in->row]);
checkNULL(out); memset(out->data, , sizeof(format) * in->row * in->col);
for (int i = ; i < in->row; i++) // i 遍历行
{
for (int j = in->ptr[i]; j < in->ptr[i + ]; j++) // j 遍历列
out->data[i * in->col + in->index[j]] = in->data[j];
}
return out;
} ELL * MATToELL(const MAT *in)// MAT转ELL
{
checkNULL(in); int i, j, maxElement;
for (i = j = maxElement = ; i < in->row * in->col; i++) // i 遍历 in->data,j 记录该行非零元素数,maxElement 记录一行非零元素最大值
{
if (in->data[i] != ) // 找到非零元
j++;
if ((i + ) % in->col == ) // 行末,更新 maxElement
{
maxElement = MAX(j, maxElement);
j = ; // 开始下一行之前清空 j
}
}
format* temp_data=(format *)malloc(sizeof(format) * in->row * maxElement); // 临时数组,将列数压缩到 maxElement
checkNULL(temp_data);
int* temp_index = (int *)malloc(sizeof(int) * in->row * maxElement);
checkNULL(temp_index);
memset(temp_data, , sizeof(format) * in->row * maxElement);
memset(temp_index, , sizeof(int) * in->row * maxElement);
for (i = j = ; i < in->row * in->col; i++) // i 遍历 in->data,j 记录该行非零元素数,把 in 中每行的元素往左边推
{
if (in->data[i] != ) // 找到非零元
{
temp_data[i / in->col * maxElement + j] = in->data[i]; // 存放元素
temp_index[i / in->col * maxElement + j] = i % in->col; // 记录所在的列号
j++;
}
if ((i + ) % in->col == ) // 行末,将剩余位置的下标记作 -1,即无效元素
{
for (j += i / in->col * in->col; j < maxElement * (i / in->col + ); j++) // 使得 j 指向本行最后一个非零元素之后的元素,再开始填充
temp_index[j] = -;
j = ; // 开始下一行之前清空 j
}
}
ELL *out = initializeELL(maxElement, in->row, in->col); // 最终输出,如果不转置的话不要这部分
checkNULL(out);
for (i = ; i < out->row * out->col; i++) // 将 temp_data 和 temp_index 转置以提高缓存利用
{
out->data[i] = temp_data[i % out->col * out->row + i / out->col];
out->index[i] = temp_index[i % out->col * out->row + i / out->col];
}
free(temp_data);
free(temp_index);
return out;
} MAT * ELLToMAT(const ELL *in) // ELL转MAT
{
checkNULL(in);
MAT *out = initializeMAT(in->col, in->colOrigin);
checkNULL(out); for (int i = ; i < in->row * in->col; i++) // i 遍历 out->data
{
if (in->index[i] < ) // 注意跳过无效元素
continue;
out->data[i % in->col * in->colOrigin + in->index[i]] = in->data[i];
}
COUNT_MAT(out);
return out;
} COO * MATToCOO(const MAT *in) // MAT转COO
{
checkNULL(in);
COO *out = initializeCOO(in->row, in->col, in->count); for (int i=, j = ; i < in->row * in->col; i++)
{
#ifdef INT
if (in->data[i] != )
#else
if (fabs(in->data[i]) > EPSILON)
#endif
{
out->data[j] = in->data[i];
out->rowIndex[j] = i / in->col;
out->colIndex[j] = i % in->col;
j++;
}
}
return out;
} MAT * COOToMAT(const COO *in) // COO转MAT
{
checkNULL(in);
MAT *out = initializeMAT(in->row, in->col, in->count);
checkNULL(out); for (int i = ; i < in->row * in->col; i++)
out->data[i] = ;
for (int i = ; i < in->count; i++)
out->data[in->rowIndex[i] * in->col + in->colIndex[i]] = in->data[i];
return out;
} DIA * MATToDIA(const MAT *in) // MAT转DIA
{
checkNULL(in); int *index = (int *)malloc(sizeof(int)*(in->row + in->col - ));
for (int diff = in->row - ; diff > ; diff--) // 左侧零对角线情况
{
int flagNonZero = ;
for (int i = ; i < in->col && i + diff < in->row; i++) // i 沿着对角线方向遍历 in->data,flagNonZero 记录该对角线是否全部为零元
{
#ifdef INT
if (in->data[(i + diff) * in->col + i] != )
#else
if (fabs(in->data[(i + diff) * in->col + i]) > EPSILON)
#endif
flagNonZero = ;
}
index[in->row - - diff] = flagNonZero; // 标记该对角线上有非零元
}
for (int diff = in->col - ; diff >= ; diff--) // 右侧零对角线情况
{
int flagNonZero = ;
for (int j = ; j < in->row && j + diff < in->col; j++)
{
#ifdef INT
if (in->data[j * in->col + j + diff] != )
#else
if (fabs(in->data[j * in->col + j + diff]) > EPSILON)
#endif
flagNonZero = ;
}
index[in->row - + diff] = flagNonZero; // 标记该对角线上有非零元
}
int *prefixSumIndex = (int *)malloc(sizeof(int)*(in->row + in->col - ));
prefixSumIndex[] = index[];
for (int i = ; i < in->row + in->col - ; i++) // 闭前缀和,prefixSumIndex[i] 表示原矩阵第 0 ~ i 条对角线中共有多少条非零对角线(含)
prefixSumIndex[i] = prefixSumIndex[i-] + index[i]; // index[in->row + in->col -2] 表示原矩阵非零对角线条数,等于 DIA 矩阵列数
DIA *out = initializeDIA(in->row, prefixSumIndex[in->row + in->col - ], in->col);
checkNULL(out); memset(out->data, , sizeof(int)*out->row * out->col);
for (int i = ; i < in->row + in->col - ; i++)
out->index[i] = index[i]; // index 搬进 out
for (int i = ; i < in->row; i++) // i,j 遍历原矩阵,将元素搬进 out
{
for (int j = ; j < in->col; j++)
{
int temp = j - i + in->row - ;
if (index[temp] == )
continue;
out->data[i * out->col + (temp > ? prefixSumIndex[temp - ] : )] = in->data[i * in->col + j]; // 第 row - 1 行第 0 列元素 temp == 0,单独处理
}
}
free(index);
free(prefixSumIndex);
return out;
} MAT * DIAToMAT(const DIA *in) // DIA转MAT
{
checkNULL(in);
MAT *out = initializeMAT(in->row, in->colOrigin);
checkNULL(out); int * inverseIndex = (int *)malloc(sizeof(int) * in->col);
for (int i = , j = ; i < in->row + in->col - ; i++) // 求一个 index 的逆,即 DIA 中第 i 列对应原矩阵第 inverseIndex[i] 对角线
{ // 原矩阵对角线编号 (row-1, 0) 为第 0 条,(0, 0) 为第 row - 1 条,(col-1, 0) 为第 row + col - 2 条
if (in->index[i] == )
{
inverseIndex[j] = i;
j++;
}
}
for (int i = ; i < in->row; i++) // i 遍历 in->data 行,j 遍历 in->data 列
{
for (int j = ; j < in->col; j++)
{
if (i < in->row - - inverseIndex[j] || i > inverseIndex[in->col - ] - inverseIndex[j]) // 跳过两边呈三角形的无效元素
continue;
out->data[i * in->col + inverseIndex[j] - in->row + ] = in->data[i * in->col + j]; // 利用 inverseIndex 来找钙元素在原距震中的位置
}
}
free(inverseIndex);
return out;
} // 各种形式的矩阵和一个向量的乘法
int dotCPU(const MAT *a, const MAT *x, MAT *y) // CPU MAT 乘法
{
checkNULL(a); checkNULL(x); checkNULL(y);
if (a->col != x->row)
{
printf("dotMATCPU dimension mismatch!\n");
return ;
} y->row = a->row;
y->col = x->col;
for (int i = ; i < a->row; i++)
{
format sum = ;
for (int j = ; j < a->col; j++)
sum += a->data[i * a->col + j] * x->data[j];
y->data[i] = sum;
}
COUNT_MAT(y);
return ;
} int dotCPU(const CSR *a, const MAT *x, MAT *y) // CPU CSR 乘法
{
checkNULL(a); checkNULL(x); checkNULL(y);
if (a->col != x->row)
{
printf("dotCSRCPU dimension mismatch!\n");
return ;
} y->row = a->row;
y->col = x->col;
for (int i = ; i < a->row; i++) // i 遍历 ptr,j 遍历行内数据,A 中为 0 的元素不参加乘法
{
format sum = ;
for (int j = a->ptr[i]; j < a->ptr[i + ]; j++)
sum += a->data[j] * x->data[a->index[j]];
y->data[i] = sum;
}
COUNT_MAT(y);
return ;
} int dotCPU(const ELL *a, const MAT *x, MAT *y) // CPU ELL乘法
{
checkNULL(a); checkNULL(x); checkNULL(y);
if (a->colOrigin != x->row)
{
printf("dotELLCPU dimension mismatch!\n");
return ;
} y->row = a->col;
y->col = x->col;
for (int i = ; i<a->col; i++)
{
format sum = ;
for (int j = ; j < a->row; j++)
{
int temp = a->index[j * a->col + i];
if (temp < ) // 跳过无效元素
continue;
sum += a->data[j * a->col + i] * x->data[temp];
}
y->data[i] = sum;
}
COUNT_MAT(y);
return ;
} int dotCPU(const COO *a, const MAT *x, MAT *y)// CPU COO乘法
{
checkNULL(a); checkNULL(x); checkNULL(y);
if (a->col != x->row)
{
printf("dotCOOCPU null!\n");
return ;
} y->row = a->row;
y->col = x->col;
for (int i = ; i<a->count; i++)
y->data[a->rowIndex[i]] += a->data[i] * x->data[a->colIndex[i]];
COUNT_MAT(y);
return ;
} int dotCPU(const DIA *a, const MAT *x, MAT *y)// CPU DIA乘法
{
checkNULL(a); checkNULL(x); checkNULL(y);
if (a->colOrigin != x->row)
{
printf("dotDIACPU null!\n");
return ;
}
y->row = a->row;
y->col = x->col;
int * inverseIndex = (int *)malloc(sizeof(int) * a->col);
for (int i = , j = ; i < a->row + a->col - ; i++)
{
if (a->index[i] == )
{
inverseIndex[j] = i;
j++;
}
}
for (int i = ; i < a->row; i++)
{
format sum = ;
for (int j = ; j < a->col; j++)
{
if (i < a->row - - inverseIndex[j] || i > inverseIndex[a->col - ] - inverseIndex[j])
continue;
sum += a->data[i * a->col + j] * x->data[i + inverseIndex[j] - a->row + ];
}
y->data[i] = sum;
}
COUNT_MAT(y);
free(inverseIndex);
return ;
} __global__ void dotGPU(const MAT *a, const MAT *x, MAT *y)// GPU MAT乘法
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < a->row)
{
format sum = ;
for (int i = ; i < a->col; i++)
sum += a->data[id * a->col + i] * x->data[i];
y->data[id] = sum;
}
if (id == )
{
y->row = a->row;
y->col = x->col;
COUNT_MAT(y);
}
return;
} __global__ void dotGPU(const CSR *a, const MAT *x, MAT *y)// GPU CSR乘法
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < a->row)
{
format sum = ;
for (int j = a->ptr[id]; j < a->ptr[id + ]; j++)
sum += a->data[j] * x->data[a->index[j]];
y->data[id] = sum;
}
if (id == )
{
y->row = a->row;
y->col = x->col;
COUNT_MAT(y);
}
return;
} __global__ void dotGPU(const ELL *a, const MAT *x, MAT *y)// GPU ELL乘法
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < a->col)
{
format sum = ;
for (int j = ; j < a->row; j++)
sum += a->data[id + j * a->col] * (a->index[id + j * a->col] < ? : x->data[a->index[id + j * a->col]]);
y->data[id] = sum;
}
if (id == )
{
y->row = a->col;
y->col = x->col;
COUNT_MAT(y);
}
return;
} __global__ void dotGPU(const COO *a, const MAT *x, MAT *y)// GPU COO乘法
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < a->count)
atomicAdd(&(y->data[a->rowIndex[id]]), a->data[id] * x->data[a->colIndex[id]]);
if (id == )
{
y->row = a->row;
y->col = x->col;
COUNT_MAT(y);
}
return;
} int test()// 测试矩阵打印和CPU计算的函数
{
const int row = , col = ; MAT* a = initializeMAT(row, col);
a->data[] = , a->data[] = , a->data[] = , a->data[] = ;
a->data[] = , a->data[] = , a->data[] = , a->data[] = ;
COUNT_MAT(a); MAT* x = initializeMAT(col, );
for (int i = ; i < x->row; i++)
x->data[i] = i + ;
COUNT_MAT(x); MAT* y = initializeMAT(row, );
COUNT_MAT(y); print("MAT a =", a);
print("MAT x =", x);
dotCPU(a, x, y);
print("MAT y = a * x = ", y); CSR* b = MATToCSR(a);
print("CSR b = a = ", b);
memset(y->data, , sizeof(format) * y->row * y->col);
dotCPU(b, x, y);
print("MAT y = b * x = ", y);
MAT* c = CSRToMAT(b);
print("MAT c = b =", c);
freeMatrix(b); ELL* d = MATToELL(a);
print("ELL d = a =", d);
memset(y->data, , sizeof(format) * y->row * y->col);
dotCPU(d, x, y);
print("MAT y = d * x =", y);
c = ELLToMAT(d);
print("MAT c = d =", c);
freeMatrix(d); COO* e = MATToCOO(a);
print("ELL e = a = ", e);
memset(y->data, , sizeof(format) * y->row * y->col);
dotCPU(e, x, y);
print("MAT y = e * x = ", y);
c = COOToMAT(e);
print("MAT c = e =", c);
freeMatrix(e); DIA* f = MATToDIA(a);
print("DIA f = a = ", f);
memset(y->data, , sizeof(format) * y->row * y->col);
dotCPU(f, x, y);
print("MAT y = f * x = ", y);
c = DIAToMAT(f);
print("MAT c = f =", c);
freeMatrix(f); freeMatrix(a);
freeMatrix(x);
freeMatrix(y);
freeMatrix(c);
return ;
} int main()
{
test(); clock_t timeCPU;
cudaEvent_t startGPU, stopGPU;
float timeGPU;
cudaEventCreate(&startGPU);
cudaEventCreate(&stopGPU); printf("\n\tStart. Matrix dimension:\t%d × %d", ROW, COL); // 初始化
timeCPU = clock();
aMAT = initializeMAT(ROW, COL);
xMAT = initializeMAT(COL, );
yMATRef = initializeMAT(ROW, );
yMATCal = initializeMAT(ROW, ); srand(SEED);
#ifdef INT
for (int i = ; i < COL * ROW; i++)
aMAT->data[i] = ((float)rand() < RAND_MAX * RATIO) ? (rand() - RAND_MAX / ) % : ;
COUNT_MAT(aMAT);
int count = aMAT->count;
for (int i = ; i < COL; i++)
xMAT->data[i] = i % ;
COUNT_MAT(xMAT);
#else
for (int i = ; i < COL * ROW; i++)
aMAT->data[i] = ((float)rand() < RAND_MAX * RATIO) ? ((float)rand() / RAND_MAX) : ;
aMAT->count = countMAT(aMAT);
for (int i = ; i < COL; i++)
xMAT->data[i] = ((float)rand() / RAND_MAX);
xMAT->count = countMAT(xMAT);
#endif
printf("\n\tInitialized. Time:\t\t%d ms\n", clock() - timeCPU); //CPU计算部分
//MAT 格式
timeCPU = clock();
dotCPU(aMAT, xMAT, yMATRef);
timeCPU = clock() - timeCPU;
printf("\n\tdotMATCPU time:\t%8.3f ms\n", (float)timeCPU); // CSR格式
aCSR = MATToCSR(aMAT);
timeCPU = clock();
dotCPU(aCSR, xMAT, yMATCal);
timeCPU = clock() - timeCPU;
printf("\n\tdotCSRCPU time:\t%8.3f ms\n", (float)timeCPU);
checkEqual(yMATRef->data, yMATCal->data, ROW); // ELL格式
aELL = MATToELL(aMAT);
timeCPU = clock();
dotCPU(aELL, xMAT, yMATCal);
timeCPU = clock() - timeCPU;
printf("\n\tdotELLCPU time:\t%8.3f ms\n", (float)timeCPU);
checkEqual(yMATRef->data, yMATCal->data, ROW); // COO格式
aCOO = MATToCOO(aMAT);
timeCPU = clock();
dotCPU(aCOO, xMAT, yMATCal);
timeCPU = clock() - timeCPU;
printf("\n\tdotCOOCPU time:\t%8.3f ms\n", (float)timeCPU);
checkEqual(yMATRef->data, yMATCal->data, ROW); // GPU计算部分
// MAT格式
cudaMemset(yMATCal->data, , sizeof(format) * yMATCal->row * yMATCal->col);
yMATCal->count = ;
cudaEventRecord(startGPU, );
dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aMAT, xMAT, yMATCal);
cudaDeviceSynchronize();
cudaEventRecord(stopGPU, );
cudaEventSynchronize(stopGPU);
cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
printf("\n\tdotMATGPU time:\t%8.3f ms\n", timeGPU);
checkEqual(yMATRef->data, yMATCal->data, ROW);
freeMatrix(aMAT); // CSR格式
cudaMemset(yMATCal->data, , sizeof(format) * yMATCal->row * yMATCal->col);
yMATCal->count = ;
cudaEventRecord(startGPU, );
dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aCSR, xMAT, yMATCal);
cudaDeviceSynchronize();
cudaEventRecord(stopGPU, );
cudaEventSynchronize(stopGPU);
cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
printf("\n\tdotCSRGPU time:\t%8.3f ms\n", timeGPU);
checkEqual(yMATRef->data, yMATCal->data, ROW);
freeMatrix(aCSR); // ELL格式
cudaMemset(yMATCal->data, , sizeof(format) * yMATCal->row * yMATCal->col);
yMATCal->count = ;
cudaEventRecord(startGPU, );
dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aELL, xMAT, yMATCal);
cudaDeviceSynchronize();
cudaEventRecord(stopGPU, );
cudaEventSynchronize(stopGPU);
cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
printf("\n\tdotELLGPU time:\t%8.3f ms\n", timeGPU);
checkEqual(yMATRef->data, yMATCal->data, ROW);
freeMatrix(aELL); // COO格式
cudaMemset(yMATCal->data, , sizeof(format) * yMATCal->row * yMATCal->col);
yMATCal->count = ;
cudaEventRecord(startGPU, );
dotGPU << <CEIL(count, THREAD_SIZE), THREAD_SIZE >> > (aCOO, xMAT, yMATCal);
cudaDeviceSynchronize();
cudaEventRecord(stopGPU, );
cudaEventSynchronize(stopGPU);
cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
printf("\n\tdotCOOGPU time:\t%8.3f ms\n", timeGPU);
checkEqual(yMATRef->data, yMATCal->data, ROW);
freeMatrix(aCOO); // 清理内存
freeMatrix(xMAT);
freeMatrix(yMATRef);
freeMatrix(yMATCal); getchar();
return ;
}

● 输出结果

MAT a =
MAT,
row = , col = , count = MAT x =
MAT,
row = , col = , count = MAT y = a * x =
MAT,
row = , col = , count = CSR b = a =
CSR,
row = , col = MAT y = b * x =
MAT,
row = , col = , count = MAT c = b =
MAT,
row = , col = , count = ELL d = a =
ELL,
row = , col = , colOrigin = - MAT y = d * x =
MAT,
row = , col = , count = MAT c = d =
MAT,
row = , col = , count = ELL e = a =
COO,
row = , col = , count =
(,,)
(,,)
(,,)
(,,)
(,,)
(,,)
(,,)
(,,) MAT y = e * x
MAT,
row = , col = , count = MAT c = e =
MAT,
row = , col = , count = Start. Matrix dimension: ×
Initialized. Time: 0.000 ms dotMATCPU time: 304.000 ms dotCSRCPU time: 3.000 ms Check output, passed. dotCELLPU time: 103.000 ms Check output, passed. dotCOOCPU time: 11.000 ms Check output, passed. dotMATGPU time: 5.133 ms Check output, passed. dotCSRGPU time: 2.263 ms Check output, passed. dotELLGPU time: 1.253 ms Check output, passed. dotCOOGPU time: 4.754 ms Check output, passed.

稀疏矩阵 part 5的更多相关文章

  1. [LeetCode] Sparse Matrix Multiplication 稀疏矩阵相乘

    Given two sparse matrices A and B, return the result of AB. You may assume that A's column number is ...

  2. 转载:稀疏矩阵存储格式总结+存储效率对比:COO,CSR,DIA,ELL,HYB

    http://www.cnblogs.com/xbinworld/p/4273506.html 稀疏矩阵是指矩阵中的元素大部分是0的矩阵,事实上,实际问题中大规模矩阵基本上都是稀疏矩阵,很多稀疏度在9 ...

  3. poj 3735 Training little cats 矩阵快速幂+稀疏矩阵乘法优化

    题目链接 题意:有n个猫,开始的时候每个猫都没有坚果,进行k次操作,g x表示给第x个猫一个坚果,e x表示第x个猫吃掉所有坚果,s x y表示第x个猫和第y个猫交换所有坚果,将k次操作重复进行m轮, ...

  4. 稀疏矩阵存储格式总结+存储效率对比:COO,CSR,DIA,ELL,HYB

    稀疏矩阵是指矩阵中的元素大部分是0的矩阵,事实上,实际问题中大规模矩阵基本上都是稀疏矩阵,很多稀疏度在90%甚至99%以上.因此我们需要有高效的稀疏矩阵存储格式.本文总结几种典型的格式:COO,CSR ...

  5. C语言 稀疏矩阵 压缩 实现

    稀疏矩阵压缩存储的C语言实现 (GCC编译). /** * @brief C语言 稀疏矩阵 压缩 实现 * @author wid * @date 2013-11-04 * * @note 若代码存在 ...

  6. 三元组表压缩存储稀疏矩阵实现稀疏矩阵的快速转置(Java语言描述)

    三元组表压缩存储稀疏矩阵实现稀疏矩阵的快速转置(Java语言描述) 用经典矩阵转置算法和普通的三元组矩阵转置在时间复杂度上都是不乐观的.快速转置算法在增加适当存储空间后实现快速转置具体原理见代码注释部 ...

  7. 稀疏矩阵乘法加法等的java实现

    原创声明:本文系作者原创,转载请写明出处. 一.前言       前几天由于科研需要,一直在搞矩阵的稀疏表示的乘法,不过最近虽然把程序写出来了,还是无法处理大规模的矩阵(虽然已经是稀疏了).原因可能是 ...

  8. Matlab稀疏矩阵

    一.矩阵存储方式 MATLAB的矩阵有两种存储方式,完全存储方式和稀疏存储方式 1.完全存储方式 将矩阵的全部元素按列存储,矩阵中的全部零元素也存储到矩阵中. 2.稀疏存储方式 仅存储矩阵所有的非零元 ...

  9. matlab——sparse函数和full函数(稀疏矩阵和非稀疏矩阵转换)

    函数功能:生成稀疏矩阵 使用方法 :S = sparse(A) 将矩阵A转化为稀疏矩阵形式,即矩阵A中任何0元素被去除,非零元素及其下标组成矩阵S.如果A本身是稀疏的,sparse(S)返回S. S ...

  10. 稀疏矩阵coo_matrix的乘法

    稀疏矩阵的乘法在做基于n-gram的分类的时候还是相当有用的,但是由于网上资料太少,所以折腾了几天才算折腾出来. 首先scipy包里常见的稀疏矩阵有三种形式, coo_matrix, csr_matr ...

随机推荐

  1. 我的代码-sql query

    # coding: utf-8 # In[ ]: WITH List AS (SELECT e.*,f.* FROM( SELECT DISTINCT c.lot_id, c.wafer_key,LE ...

  2. mvc框架模式

    首先分为3个板块 路由的api相当于一个域名. 根据当前地址在执行路由里的代码; 逻辑层: 书写业务逻辑的都代码都放在controller层 数据处理层: model 写数据的增删改查方法,导出一般供 ...

  3. CodeForce 517 Div 2. C Cram Time

    http://codeforces.com/contest/1072/problem/C C. Cram Time time limit per test 1 second memory limit ...

  4. debian-pve-ceph

    从头开始安装debian9.8 debian-9.8.0-amd64-netinst.iso vi /etc/vim/vimrc.tinyset nocompatibleset backspace=2 ...

  5. springboot 静态资源配置

    package com.linmingliang.myblog.interceptor; import org.springframework.stereotype.Component;import ...

  6. 我发起了一个 用 javascript 写一个 Office 的 开源项目 JS Office

    用   js  写   Office,  说实在的,  把 现在已有的 各种 富文本编辑器 和 电子表格 js 库 收集起来 整合一下 就 差不多了 , 放到 前几天 那个 “  js 作为 一等公民 ...

  7. 2、Linux安装jmeter

    1.下载地址 http://jmeter.apache.org/download_jmeter.cgi 2.选择下载,本地选择,Source版会比较坑 3.安装jdk8,过程省略,可参考:https: ...

  8. Dubbo-Admin 2.6.0使用

    一.下载源码 下载2.6.0的源码 https://github.com/apache/incubator-dubbo/releases/tag/dubbo-2.6.0 二.使用Eclipse打开du ...

  9. Beyond Compare 3添加右键菜单

    目前是在Beyond Compare 3.1.9版本上试验可行,其他版本上尚未测试. 添加右键菜单步骤: 1.新建为.bat后缀的文本,将下面“添加右键菜单批处理”复制到此文本中. 2.将批处理移动到 ...

  10. 采用Anaconda平台调用pymc3时出现错误的解决方法

    提示:(1)module 'theano' has no attribute 'gof',c++编辑出现错误 (2)stdio.h file not found 解决方法:(1)在终端中输入 xcod ...