Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 1

▶ 第三章，逐步优化了一个二维卷积计算的过程

● 基准代码

 #include <stdio.h>

 #include <stdlib.h>

 #include <string.h>

 #include <math.h>

 #include <time.h>

 #include <sys/time.h>

 #include <omp.h>

 #include <assert.h>

 #include <sys/mman.h>

 /* Element type and problem size for the stencil benchmark. */
 #define REAL    double
 #define WIDTH   1030
 #define HEIGHT  2048
 #define COUNT   1000
 /* Set to 1 to pad every row so that its byte length is a multiple of 64. */
 #define PAD64   0
 /* WIDTHP is the row stride, in elements, that the buffers are indexed with. */
 #if !PAD64
     #define WIDTHP WIDTH
 #else
     #define WIDTHP (((WIDTH * sizeof(REAL) + 63) / 64) * 64 / sizeof(REAL))
 #endif

 /*
  * Fill a row-major grid with a striped test pattern.
  *
  * fbuf   : destination buffer, indexed with a row stride of WIDTHP elements.
  * width  : number of elements written in each row.
  * height : number of rows written.
  *
  * Every row whose index is a multiple of 10 is set to 1.0; all other rows
  * are set to 0.
  */
 void initbuf(REAL *fbuf, const int width, const int height)
 {
     for (int y = 0; y < height; y++)
     {
         REAL val = (y % 10) ? 0 : 1.0;
         for (int x = 0; x < width; x++)
             fbuf[y * WIDTHP + x] = val;
     }
     return;
 }

 /*
  * Baseline kernel: apply a 9-point weighted stencil (blur) to a row-major
  * 2-D grid, repeated `count` times.
  *
  * finp, foutp  : two buffers indexed with a row stride of WIDTHP elements.
  *                They are used alternately as input and output, so both are
  *                overwritten.
  * width, height: extent of the grid.  Only rows 1..height-2 and columns
  *                1..width-2 are written; the one-element border (halo) is
  *                read but never updated.
  * ctr, next, diag: weights of the centre element, of the four edge
  *                neighbours and of the four diagonal neighbours.
  * count        : number of passes.  The last pass writes into foutp when
  *                count is odd and into finp when count is even.
  */
 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     const REAL ctr, const REAL next, const REAL diag, const int count)
 {
     REAL *fin = finp, *fout = foutp;
     for (int i = 0; i < count; i++)
     {
         for (int y = 1; y < height - 1; y++)    // skip the halo rows
         {
             // Index of the first interior element (column 1) of row y,
             // followed by the indices of its eight neighbours.
             int c = 1 + y * WIDTHP;
             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
             for (int x = 1; x < width - 1; x++) // skip the halo columns
             {
                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
                 // Slide the whole 3x3 window one column to the right.
                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
             }
         }
         // Swap the buffers: this pass's output is the next pass's input.
         REAL *ftmp = fin;
         fin = fout;
         fout = ftmp;
     }
     return;
 }

 /*
  * Return the current time of day in seconds, with the microsecond part as
  * the fraction, as reported by gettimeofday().  The difference between two
  * calls is used as the elapsed time of the kernel.
  */
 static double dtime(void)
 {
     struct timeval mytime;
     gettimeofday(&mytime, NULL);    // the time-zone argument is not needed
     return (double)mytime.tv_sec + (double)mytime.tv_usec * 1.0e-6;
 }

 /*
  * Benchmark driver: allocate two WIDTHP x HEIGHT grids, initialise them,
  * run the stencil kernel COUNT times and print the elapsed time and the
  * resulting MFlops figure.
  *
  * Returns 0 on success and EXIT_FAILURE when a grid cannot be allocated.
  */
 int main(int argc, char *argv[])
 {
     (void)argc;
     (void)argv;
     REAL *fa = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);
     REAL *fb = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);
     // malloc reports failure with NULL (MAP_FAILED only applies to mmap), and
     // the check must survive builds with NDEBUG, so it is not an assert.
     if (fa == NULL || fb == NULL)
     {
         fprintf(stderr, "Failed to allocate the grids\n");
         free(fa);
         free(fb);
         return EXIT_FAILURE;
     }
     // omp_get_num_threads() returns 1 outside a parallel region, so report
     // the size of the team a parallel region would get instead.  WIDTHP has
     // type size_t when PAD64 is enabled, hence the cast for %d.
     printf("Initializing..%d Threads, %d x %d, PAD=%d..\n\n", omp_get_max_threads(), WIDTH, HEIGHT, (int)WIDTHP);
     initbuf(fa, WIDTHP, HEIGHT);
     initbuf(fb, WIDTHP, HEIGHT);
     printf("Running stencil kernel %d times\n", COUNT);
     const REAL stendiag = 0.00125, stennext = 0.00125, stenctr = 0.99;
     double time_b, time_e;
     time_b = dtime();
     stencil9pt(fa, fb, WIDTHP, HEIGHT, stenctr, stennext, stendiag, COUNT);
     time_e = dtime();
     printf("Elapsed time : %.3f (s)\n", time_e - time_b);
     // Each element costs 17 floating-point operations (9 multiplies, 8 adds).
     printf("FLOPS: %.3f (MFlops)\n", (WIDTHP * HEIGHT) * 17.0 * COUNT / (time_e - time_b) * 1.0e-06);
     free(fa);
     free(fb);
     return 0;
 }

■ 输出结果

Xeon:

Running stencil kernel 1000 times

Elapsed time: 5.754 (s)

FLOPS       : 6232.567 (MFlops)

XeonPhi:

Running stencil kernel 1000 times

Elapsed time: 102.042 (s)

FLOPS       : 351.428 (MFlops)

● 修改计算函数，忽略指针依赖关系

 /*
  * 9-point weighted stencil (blur) over a row-major 2-D grid, repeated
  * `count` times, with the inner loop marked `#pragma ivdep`.
  *
  * finp, foutp  : two buffers indexed with a row stride of WIDTHP elements.
  *                They are used alternately as input and output, so both are
  *                overwritten.
  * width, height: extent of the grid.  Only rows 1..height-2 and columns
  *                1..width-2 are written; the one-element border (halo) is
  *                read but never updated.
  * ctr, next, diag: weights of the centre element, of the four edge
  *                neighbours and of the four diagonal neighbours.
  * count        : number of passes.  The last pass writes into foutp when
  *                count is odd and into finp when count is even.
  */
 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     const REAL ctr, const REAL next, const REAL diag, const int count)
 {
     REAL *fin = finp, *fout = foutp;
     for (int i = 0; i < count; i++)
     {
         for (int y = 1; y < height - 1; y++)    // skip the halo rows
         {
             // Index of the first interior element (column 1) of row y,
             // followed by the indices of its eight neighbours.
             int c = 1 + y * WIDTHP;
             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
             #pragma ivdep                       // ignore assumed (pointer) dependencies
             for (int x = 1; x < width - 1; x++) // skip the halo columns
             {
                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
                 // Slide the whole 3x3 window one column to the right.
                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
             }
         }
         // Swap the buffers: this pass's output is the next pass's input.
         REAL *ftmp = fin;
         fin = fout;
         fout = ftmp;
     }
     return;
 }

■ 输出结果（Xeon Phi）

Running stencil kernel 1000 times

Elapsed time: 24.052 (s)

FLOPS       : 1490.925 (MFlops)

● OpenMP

 /*
  * 9-point weighted stencil (blur) over a row-major 2-D grid, repeated
  * `count` times, with the rows of each pass shared among OpenMP threads.
  *
  * finp, foutp  : two buffers indexed with a row stride of WIDTHP elements.
  *                They are used alternately as input and output, so both are
  *                overwritten.
  * width, height: extent of the grid.  Only rows 1..height-2 and columns
  *                1..width-2 are written; the one-element border (halo) is
  *                read but never updated.
  * ctr, next, diag: weights of the centre element, of the four edge
  *                neighbours and of the four diagonal neighbours.
  * count        : number of passes.  The last pass writes into foutp when
  *                count is odd and into finp when count is even.
  */
 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     const REAL ctr, const REAL next, const REAL diag, const int count)
 {
     REAL *fin = finp, *fout = foutp;
     for (int i = 0; i < count; i++)
     {
         int y, x;
         // x is declared outside the parallel loop, so it has to be listed as
         // private; the loop variable y is private automatically.
 #pragma omp parallel for private(x)
         for (y = 1; y < height - 1; y++)        // skip the halo rows
         {
             // Index of the first interior element (column 1) of row y,
             // followed by the indices of its eight neighbours.
             int c = 1 + y * WIDTHP;
             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
 #pragma ivdep                                   // ignore assumed (pointer) dependencies
             for (x = 1; x < width - 1; x++)     // skip the halo columns
             {
                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
                 // Slide the whole 3x3 window one column to the right.
                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
             }
         }
         // Swap the buffers: this pass's output is the next pass's input.
         REAL *ftmp = fin;
         fin = fout;
         fout = ftmp;
     }
     return;
 }

■ 输出结果（Xeon Phi）

Running stencil kernel 1000 times

Elapsed time : 0.700 (s)

FLOPS        : 51201.595 (MFlops)

● 数组对齐

 // 打开 PAD64

 #define PAD64   1

 // 替换 malloc 和 free

     REAL *fa = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);

     REAL *fb = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);

     ...

     free(fa);

     free(fb);

 // 替换为

     REAL *fa = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);

     REAL *fb = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);

     ...

     _mm_free(fa);

     _mm_free(fb);

■ 输出结果（Xeon Phi）

Running stencil kernel 1000 times

Elapsed time : 0.651 (s)

FLOPS        : 55155.399 (MFlops)

● 流存储

 /*
  * 9-point weighted stencil (blur) over a row-major 2-D grid, repeated
  * `count` times, with the rows of each pass shared among OpenMP threads and
  * the inner loop marked for streaming (non-temporal) stores.
  *
  * finp, foutp  : two buffers indexed with a row stride of WIDTHP elements.
  *                They are used alternately as input and output, so both are
  *                overwritten.
  * width, height: extent of the grid.  Only rows 1..height-2 and columns
  *                1..width-2 are written; the one-element border (halo) is
  *                read but never updated.
  * ctr, next, diag: weights of the centre element, of the four edge
  *                neighbours and of the four diagonal neighbours.
  * count        : number of passes.  The last pass writes into foutp when
  *                count is odd and into finp when count is even.
  */
 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     const REAL ctr, const REAL next, const REAL diag, const int count)
 {
     REAL *fin = finp, *fout = foutp;
     for (int i = 0; i < count; i++)
     {
         int y, x;
         // x is declared outside the parallel loop, so it has to be listed as
         // private; the loop variable y is private automatically.
 #pragma omp parallel for private(x)
         for (y = 1; y < height - 1; y++)        // skip the halo rows
         {
             // Index of the first interior element (column 1) of row y,
             // followed by the indices of its eight neighbours.
             int c = 1 + y * WIDTHP;
             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
 #pragma ivdep                                   // ignore assumed (pointer) dependencies
 #pragma vector nontemporal                      // use streaming stores (or compile with -opt-streaming-stores always)
             for (x = 1; x < width - 1; x++)     // skip the halo columns
             {
                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
                 // Slide the whole 3x3 window one column to the right.
                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
             }
         }
         // Swap the buffers: this pass's output is the next pass's input.
         REAL *ftmp = fin;
         fin = fout;
         fout = ftmp;
     }
     return;
 }

■ 输出结果

Xeon:

Running stencil kernel 1000 times

Elapsed time: 1.905 (s)

FLOPS       : 18824.053 (MFlops)

XeonPhi:

Running stencil kernel 1000 times

Elapsed time: 0.639 (s)

FLOPS       : 56145.417 (MFlops)

● 2 MB 存储页（书中代码没有 munmap 的部分）。参考（https://www.aliyun.com/jiaocheng/208004.html），但是服务器上编译时找不到 MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB 等定义，无法使用

 // 替换 _mm_malloc 和 _mm_free

     REAL *fa = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);

     REAL *fb = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);

     ...

     _mm_free(fa);

     _mm_free(fb);

 // 替换为

     REAL *fa = (REAL *)mmap(0, WIDTHP*HEIGHT * sizeof(REAL), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);

     REAL *fb = (REAL *)mmap(0, WIDTHP*HEIGHT * sizeof(REAL), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);

     ...

     munmap(fa, WIDTHP*HEIGHT * sizeof(REAL));

     munmap(fb, WIDTHP*HEIGHT * sizeof(REAL));

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 1的更多相关文章

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 4
▶ 第五章,几个优化 ● 代码 #include <stdio.h> #include <stdlib.h> #include <math.h> #define S ...
Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 3
▶ 第二章,几个简单的程序 ● 代码,单线程 #include <stdio.h> #include <stdlib.h> #include <string.h> ...
Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 2
▶ 第四章,逐步优化了一个三维卷积计算的过程 ● 基准代码 #include <stdio.h> #include <stdlib.h> #include <string ...
Xeon Phi 编程备忘
▶ 闲鱼的 Xeon Phi 3120A 配办公室的新 Xeon 服务器,记录一下环境安装过程. ● 原本尝试搭 Ubuntu 服务器,参考[https://software.intel.com/en ...
Python猫荐书系列之五：Python高性能编程
稍微关心编程语言的使用趋势的人都知道,最近几年,国内最火的两种语言非 Python 与 Go 莫属,于是,隔三差五就会有人问:这两种语言谁更厉害/好找工作/高工资…… 对于编程语言的争论,就是猿界的生 ...
《高性能javascript》一书要点和延伸（上）
前些天收到了HTML5中国送来的<高性能javascript>一书,便打算将其做为假期消遣,顺便也写篇文章记录下书中一些要点. 个人觉得本书很值得中低级别的前端朋友阅读,会有很多意想不到的 ...
高质量C++/C编程指南（林锐）
推荐-高质量C++/C编程指南(林锐) 版本/状态作者参与者起止日期备注 V 0.9 草稿文件林锐 2001-7-1至 2001-7-18 林锐起草 V 1.0 正式文件林锐 20 ...
物联网操作系统HelloX应用编程指南
HelloX操作系统应用编程指南 HelloX应用开发概述可以通过三种方式,在HelloX操作系统基础上开发应用: 1．以内部命令方式实现应用,直接编译链接到HelloX的内核she ...
JDK 高性能编程之容器
高性能编程在对不同场景下对于容器的选择有着非常苛刻的条件,这里记录下前人总结的经验,并对源码进行调试 JDK高性能编程之容器读书笔记内容部分来源书籍深入理解JVM.互联网等先放一个类图util,点 ...

随机推荐

php解析Excel表格并且导入MySQL数据库
最近根据客户需求,需要增加一个导入Excel表格的功能,Excel中存放的是知识库中医知识的分类体系目录.是在thinkphp框架下编写的代码,用的是phpexcel第三方包.测试环境用的是xampp ...
keil的51项目创建
keil的51项目创建步骤: 工程创建: Project->New uVision Project 项目命名:如...test CPU->Atmel::AT89C51 文件创建: File ...
成员变量位置获取url
L2-007. 家庭房产（并查集）*
L2-007. 家庭房产参考博客 #include <iostream> #include <cstdio> #include <cstring> #includ ...
使用vue+koa实现一个简单的图书小程序（1）
这个系列的博客用来记录我开发时候遇到的问题以及学习到的知识边做边学: 前后端分离,高内聚低耦合小程序端使用了mpvue 内部使用了vuejs的语法来做整个小程序的渲染层后端使用的是koa2搭建一 ...
leetcode 421.Maximum XOR of Two Numbers in an Array
题目中给定若干个数,然后任意选定两个数使得其异或值最大. 先利用样例中的: 3 10 5 25 2 8 这些数转换为二进制来看的话那么是先找到最高位的1然后与数组中其他的数相与后的数值保存到set中去 ...
UVa699
这个建树的根选的很有意思,在中间作为树的根.所以二叉树建树的方法虽然一般是有两种数组的方法,一个是如果深度不太大的话,可以之间用2*k+1,2*k建树,如果很大的话,就挨着建树,弄一个结构体,有左右子 ...
mysql查询出现In aggregated query without GROUP BY, expression #1 of SELECT list contains nonaggregated column 'zhibo.a.id';
出现问题: Error querying database. Cause: com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: In ...
学习笔记TF040:多GPU并行
TensorFlow并行,模型并行,数据并行.模型并行根据不同模型设计不同并行方式,模型不同计算节点放在不同硬伯上资源运算.数据并行,比较通用简便实现大规模并行方式,同时使用多个硬件资源计算不同bat ...
使用Log4net 日志系统
官方文档 http://logging.apache.org/log4net/release/config-examples.html C# 项目中直接使用nuget,下载Apache的log4net ...

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 1

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 1的更多相关文章

随机推荐

热门专题