OpenACC 书上的范例代码（Jacobi 迭代），part 1

▶ 使用Jacobi 迭代求泊松方程的数值解

● 原始串行版本

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 /*
  * Portable wall-clock timer shim.
  *   gettime(a)   - store the current time in *a
  *   usec(t1,t2)  - elapsed time from t1 to t2
  *   timestruct   - the platform's time record type
  * Despite its name, usec() yields milliseconds on the Windows branch and
  * microseconds on the other branch.
  */
 #if defined(_WIN32) || defined(_WIN64)                                                      // unified timer

 // NOTE(review): absolute path into a PGI 19.4 install; presumably needed by the author's toolchain - confirm before porting
 #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>

 #define gettime(a)  _ftime(a)

 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))        // unit: ms

 typedef struct _timeb timestruct;

 #else

 #include <sys/time.h>

 #define gettime(a)  gettimeofday(a, NULL)

 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)   // unit: us

 typedef struct timeval timestruct;

 #endif

 #define IMPROV                                          // when defined, the per-sweep correction magnitude is also used as a loop-exit condition

 /*
  * Squared distance of the point (x, y) from the origin: x^2 + y^2.
  *
  * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
  * `inline` definition provides no external definition, so any call the
  * compiler chooses not to inline (e.g. at -O0) fails at link time.
  */
 static inline float uval(float x, float y)
 {
     return x * x + y * y;
 }

 /*
  * Serial Jacobi iteration for a Poisson problem on a (row + 1) x (col + 1)
  * grid stored row-major in two flat float buffers (u0 = current sweep,
  * u1 = next sweep). Boundary values are set to g(x, y) = x^2 + y^2, the
  * interior of u0 starts at zero, and the elapsed time of the sweep loop is
  * printed. Returns 0 on success, EXIT_FAILURE if the buffers cannot be
  * allocated.
  */
 int main(void)
 {
     // NOTE(review): the numeric literals were lost from the published listing.
     // The +1 / -1 offsets and the loop bounds follow from the surrounding
     // comments; the grid size and iteration cap below are assumed values -
     // confirm against the original example before comparing timings.
     const int row = 8191, col = 1023;                   // number of grid rows and columns
     const float height = 1.0, width = 2.0;              // physical extent; not proportional to row/col, so cells are rectangular
     const float hx = height / row, wy = width / col;    // cell height and width
     const float fij = -4.0f;                            // f(x,y) = -4, for which the solution is z = x^2 + y^2
     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
     const int maxIter = 100;                            // iteration cap
     const int colPlus = col + 1;                        // actual number of columns stored per row
 #ifdef IMPROV
     const float errControl = 0.0f;                      // correction threshold; 0 disables the early exit
     float err = 0.0f;                                   // largest correction seen in the current sweep
 #endif

     // Two tables holding the grid; each has (row + 1) x (col + 1) entries.
     float *u0 = malloc(sizeof *u0 * (row + 1) * colPlus);
     float *u1 = malloc(sizeof *u1 * (row + 1) * colPlus);
     float *utemp = NULL;                                // scratch pointer for swapping u0 and u1
     if (u0 == NULL || u1 == NULL)
     {
         fprintf(stderr, "Failed to allocate grid buffers.\n");
         free(u0);
         free(u1);
         return EXIT_FAILURE;
     }

     // Initialise the boundary to g(x,y) = x^2 + y^2.
     for (int ix = 0; ix <= row; ix++)                   // left and right edges
     {
         u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
         u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
     }
     for (int jy = 0; jy <= col; jy++)                   // top and bottom edges
     {
         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
         u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
     }
     for (int ix = 1; ix < row; ix++)                    // interior points of u0 start at 0.0f
     {
         for (int jy = 1; jy < col; jy++)
         {
             u0[ix*colPlus + jy] = 0.0f;
         }
     }

     // Timed computation.
     timestruct t1, t2;
     gettime(&t1);
     for (int iter = 1; iter < maxIter; iter++)
     {
 #ifdef IMPROV
         // Reset per sweep: otherwise err only ever grows across sweeps and
         // the convergence test below could never succeed.
         err = 0.0f;
 #endif
         for (int ix = 1; ix < row; ix++)
         {
             for (int jy = 1; jy < col; jy++)
             {
                 u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                     hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
 #ifdef IMPROV
                 // Track the largest correction over the whole table; fmaxf/fabsf
                 // are used because `max` is not a standard C function.
                 err = fmaxf(fabsf(u0[ix*colPlus + jy] - u1[ix*colPlus + jy]), err);
 #endif
             }
         }
 #ifdef IMPROV
         //printf("\niter = %d, err = %e\n", iter, err);  // per-sweep trace
         if (err < errControl)                           // stop once the correction drops below the threshold
         {
             break;
         }
 #endif
         utemp = u0, u0 = u1, u1 = utemp;                // swap the two tables
     }
     gettime(&t2);

     long long timeElapse = usec(t1, t2);
     // The timer macro yields milliseconds on Windows and microseconds
     // elsewhere, so the printed unit has to follow the same platform test.
 #if defined(_WIN32) || defined(_WIN64)
     const char *unit = "ms";
 #else
     const char *unit = "us";
 #endif
     printf("\nElapsed time: %13lld %s.\n", timeElapse, unit);

     free(u0);
     free(u1);
     getchar();                                          // keep the console window open until a key is pressed
     return 0;
 }

● 输出结果（使用 IMPROV），可以看到很多 not fused，这都是可以改进的地方

D:\Code\OpenACC>pgcc main.c -Minfo -o main.exe                      // 普通编译

main:

     , FMA (fused multiply-add) instruction(s) generated          // 使用乘加指令

uval:

     , FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>pgcc main.c -Minfo -o main-fast.exe -fast           // 添加 fast 选项

main:

     , uval inlined, size= (inline) file main.c ()             // 4 个内联函数

          , Loop not fused: different loop trip count             // 相邻两个循环的迭代次数不同，无法融合（fuse）成一个循环

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed  // 编译期无法确定 u0 和 u1 是否重叠，因此同时生成向量版和标量版，运行时检测指针是否冲突后再决定执行哪一个

              Loop not vectorized: data dependency

              Loop unrolled  times  // 循环展开

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4     // 识别出清零循环，改为调用库函数 __c_mzero4 批量赋零（作用类似 memset，而不是 memcpy）

     , Loop not vectorized/parallelized: potential early exits    // 有额外脱离循环的条件，拒绝并行

     , Loop not vectorized: data dependency

         Loop unrolled  times

         FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

Elapsed time:           ms.

D:\Code\OpenACC>main-fast.exe

Elapsed time:           ms.                                     // 加了 fast 反而更慢

● 输出结果（不用 IMPROV），发现变快了，可见提前跳出循环的 if 语句对并行化有很大影响。在本例中我们让 errControl = 0，每次循环多一个判断（实际绝对不会跳出），就严重干扰了编译

D:\Code\OpenACC>pgcc main.c -Minfo -o main.exe

main:

     , FMA (fused multiply-add) instruction(s) generated

uval:

     , FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>pgcc main.c -Minfo -o main-fast.exe -fast

main:

     , uval inlined, size= (inline) file main.c ()

          , Loop not fused: different loop trip count

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed

              Loop not vectorized: data dependency

              Loop unrolled  times

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Loop not fused : function call before adjacent loop    // ？

     , Loop not vectorized : data dependency

      Loop unrolled  times                                     // 展开次数由 2 变成 4

         FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

Elapsed time:           ms.                                 // 变快了 1 倍

D:\Code\OpenACC>main-fast.exe

Elapsed time:            ms.                                 // 再变快 1 倍

● 使用 OpenMP 优化（就一句导语）

 // Add below #include <math.h>
 #include <omp.h>

 // Add directly below the line: for (int iter = 1; iter < maxIter; iter++){
 // Under default(none) every variable used in the loop nest needs an explicit
 // data-sharing clause. The read-only const scalars go in firstprivate, which
 // is accepted both by compilers that treat const variables as predetermined
 // shared and by those that require them to be listed. err must appear only
 // in the reduction clause; listing it in private as well is invalid.
 #ifdef IMPROV
 #pragma omp parallel for reduction(max:err) default(none) shared(u0, u1) firstprivate(c1, c2, hx2, wy2, fij, row, col, colPlus)
 #else
 #pragma omp parallel for default(none) shared(u0, u1) firstprivate(c1, c2, hx2, wy2, fij, row, col, colPlus)
 #endif

● 输出结果

D:\Code\OpenACC>set OMP_NUM_THREADS=4                          // 使用 4 个线程

D:\Code\OpenACC>pgcc main.c -Minfo -o main4I.exe -fast -mp      // 用 IMPROV

main:

     , uval inlined, size= (inline) file main.c ()

          , Loop not fused: different loop trip count

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed

              Loop not vectorized: data dependency

              Loop unrolled  times

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Loop not vectorized/parallelized: potential early exits

     , Parallel region activated                              // OpenMP 并行区

         Parallel loop activated with static block schedule

     , Loop not vectorized: data dependency

         Loop unrolled  times

         FMA (fused multiply-add) instruction(s) generated

     , Begin critical section                                 // reduction(max:err) 归并各线程的 err 时产生的临界区（串行区）

         End critical section

         Barrier                                                // 栅栏

         Parallel region terminated                             

D:\Code\OpenACC>main4I.exe                                      

Elapsed time:            ms.                                 // 还是快了 3.8 倍

D:\Code\OpenACC>pgcc main.c -Minfo -o main4.exe -fast -mp       // 不用 IMPROV

main:

     , uval inlined, size= (inline) file main.c ()

          , Loop not fused: different loop trip count

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed

              Loop not vectorized: data dependency

              Loop unrolled  times

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Loop not vectorized/parallelized: contains a parallel region   // 有 OpenMP的并行区，拒绝并行

     , Parallel region activated

         Parallel loop activated with static block schedule

     , Loop not vectorized: data dependency

         Loop unrolled  times

         FMA (fused multiply-add) instruction(s) generated

     , Barrier                                                // 没有了串行区

         Parallel region terminated

D:\Code\OpenACC>main4.exe

Elapsed time:            ms.                                 // 还能再快点，加速比 1.4

D:\Code\OpenACC>set OMP_NUM_THREADS=8                          // 使用 8 线程

D:\Code\OpenACC>pgcc main.c -Minfo -o main8.exe -fast -mp

...// 跟 4 线程时一模一样

D:\Code\OpenACC>main8.exe

Elapsed time:            ms.                                 // 不再线性加速，加速比 1.5

▶ 在 Ubuntu 下跑的结果，加速前比 win10 慢很多，关闭 IMPROV 并开启 OpenMP 和 fast 选项后速度接近

mainI.exe            us

mainI-fast.exe       us  // 加速比 3.1

main.exe             us  // 加速比 2.1

main-fast.exe         us  // 加速比 6.4

cuan@CUAN:~$ pgcc mainI.c -Minfo -o main4I-fast.exe -fast -mp // 要求我将 row，col，fij 放入 OpenMP 的 shared 导语中，在 win10 下没有显式放入也行

PGC-S--row must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: )

PGC-S--col must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: )

PGC-S--fij must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: )

PGC/x86- Linux 19.4-: compilation completed with severe errors

main4I-fast.exe       us  

main4-fast.exe        us  // 加速比 8.8 

main8-fast.exe        us  // 不能继续线性加速

OpenACC 书上的范例代码（Jacobi 迭代），part 1的更多相关文章

OpenACC 书上的范例代码（Jacobi 迭代），part 3
▶ 使用Jacobi 迭代求泊松方程的数值解 ● 使用 data 构件,强行要求 u0 仅拷入和拷出 GPU 各一次,u1 仅拷入GPU 一次 #include <stdio.h> #in ...
OpenACC 书上的范例代码（Jacobi 迭代），part 2
▶ 使用Jacobi 迭代求泊松方程的数值解 ● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制 #include <stdio.h> #include <stdl ...
C#高级编程（第9版） -C#5.0&.Net4.5.1 书上的示例代码下载链接
http://www.wrox.com/WileyCDA/WroxTitle/Professional-C-5-0-and-NET-4-5-1.productCd-1118833031,descCd- ...
uva 213 - Message Decoding (我认为我的方法要比书上少非常多代码，不保证好……)
#include<stdio.h> #include<math.h> #include<string.h> char s[250]; char a[10][250] ...
java代码流类。。程序怎么跟书上的结果不一样？？？
总结:这个程序很容易懂.的那是这个结果我觉得有问题啊..怎么“stop”后,输出的内容是输入过的呢? 应该是没有关系的呀,与输入的值是不同的....怎么书上运行的结果和我的不一样啊 package c ...
面试必备：高频算法题终章「图文解析 + 范例代码」之矩阵二进制 + 位运算 + LRU 合集
Attention 秋招接近尾声,我总结了牛客.WanAndroid 上,有关笔试面经的帖子中出现的算法题,结合往年考题写了这一系列文章,所有文章均与 LeetCode 进行核对.测试.欢迎食用本 ...
JAVA理解逻辑程序的书上全部重要的习题
今天随便翻翻看以前学过JAVA理解逻辑程序的书上全部练习,为了一些刚学的学弟学妹,所以呢就把这些作为共享了. 希望对初学的学弟学妹有所帮助! 例子:升级“我行我素购物管理系统”,实现购物结算功能代码 ...
OK 开始实践书上的项目一：即使标记
OK 开始实践书上的项目一:及时标记然而....又得往前面看啦! ----------------------我是分割线------------------------ 代码改变世界
关于node的基础理论，书上看来的
最近看了一本书,说了一些Node.js的东西,现在来记录一下,让自己记得更牢靠一点. 在书上,是这样介绍的:Node.js模型是源于Ruby的Event Machine 和 Python的Twiste ...

随机推荐

Luogu 3245 大数
Luogu 3245 大数开始就想 $10$ 进制 $hash$ ,$Hash(r)\equiv Hash(l-1)\cdot 10^{r-l+1}$ ,感觉没什么美妙的性质啊... 然 ...
jquey学习2之jquery动态添加页面片段
第一个方法:append()方法 [1]$(selector).append(content)//向匹配的所有标签中的内容末尾处添加Html代码,会编译成页面显示. <html> < ...
jaeger 使用ElasticSearch 作为后端存储
jaeger 支持es 作为后端存储,这样对于查询.以及系统扩展是比较方便的使用docker-compose 运行环境准备参考项目: https://github.com/rongfenglia ...
hadoop之 node manager起不来，执行mapreduce 程序hang住
现象: node manager起不来, 执行mapreduce 程序hang住 namenode 进程状态查询[root@hadp-master sbin]# jps8608 ResourceMan ...
FineUI与百度地图简单示例（转帖）
http://www.fineui.com/bbs/forum.php?mod=viewthread&tid=4191&extra=page%3D1 前台代码 <%@ Page ...
JS怎么把字符串数组转换成整型数组
今天在学习highcharts时,遇到了一个把字符串数组转换为整形数组的问题,拿在这里讨论一下: 比如有一个字符串: var dataStr="1,2,3,4,5"; 现在需要把它 ...
ML(3.1): NavieBayes R_e1071
朴素贝叶斯方法是一种使用先验概率去计算后验概率的方法, 具体见ML(3): 贝叶斯方法 R包 ① e1071::e1071 ② klaR::klaR 参考资料:https://en.wikibooks ...
java jni 调用c++ opencv代码成功范例
java上建立接口定义 package com.dtk; public class Rec { public native String RecImage(String src); public st ...
Linux下Oracle中SqlPlus时上下左右键乱码问题的解决办法
window下的sqlplus可以通过箭头键,来回看历史命令,用起来非常的方便. 但是在Linux下,会出现各种乱码,非常不方便,如下图所示,每次打错一个字符就需要重新打一遍. 解决办法:rlwrap ...
System V 消息队列实例
前言: 消息队列是消息的链接表,存放在内核中,并由消息队列标识符标识.我们将称消息队列为 “队列”,其标识符为“队列I D”.msgget创建一个新队列或打开一个存在的队列; msgsnd向队列末端添 ...

OpenACC 书上的范例代码（Jacobi 迭代），part 1

OpenACC 书上的范例代码（Jacobi 迭代），part 1的更多相关文章

随机推荐

热门专题