Here is the code:

  1. #include <stdio.h>
  2. #include <xmmintrin.h>
  3. #include <windows.h>
  4.  
  5. typedef __m128 Vec;
  6.  
  7. typedef unsigned long long value_t;
  8.  
  9. __forceinline value_t now()
  10. {
  11. LARGE_INTEGER n;
  12. QueryPerformanceCounter(&n);
  13. return n.QuadPart;
  14. }
  15.  
  16. inline void img_transpose(
  17. Vec *dst_img,
  18. Vec *src_img,
  19. const int src_w,
  20. const int src_h)
  21. {
  22. #pragma omp parallel for
  23. for (int j = ; j < src_w; ++j)
  24. {
  25. for (int i = ; i < src_h; ++i)
  26. {
  27. dst_img[j * src_h + i] = src_img[i * src_w + j];
  28. }
  29. }
  30. }
  31.  
  32. inline void img_transpose_block(
  33. Vec *dst_img,
  34. Vec *src_img,
  35. const int src_w,
  36. const int src_h)
  37. {
  38. #pragma omp parallel for
  39. for (int j = ; j < src_w; j += )
  40. {
  41. for (int i = ; i < src_h; i += )
  42. {
  43. const int nsize = min(j + , src_w);
  44. const int msize = min(i + , src_h);
  45.  
  46. for (int n = j; n < nsize; ++n)
  47. {
  48. for (int m = i; m < msize; ++m)
  49. {
  50. dst_img[n * src_h + m] = src_img[m * src_w + n];
  51. }
  52. }
  53. }
  54. }
  55. }
  56.  
  57. int main(int argc, char *argv[])
  58. {
  59. //// performance benchmark ////
  60.  
  61. const int w = ;
  62. const int h = ;
  63. Vec *a = new Vec [w * h];
  64. Vec *b = new Vec [w * h];
  65. value_t start_time, end_time;
  66.  
  67. LARGE_INTEGER freq;
  68. QueryPerformanceFrequency(&freq);
  69. double ms_per_tick = 1000.0 / (double)freq.QuadPart;
  70.  
  71. start_time = now();
  72.  
  73. for (int t = ; t < ; ++t)
  74. {
  75. img_transpose(b, a, w, h);
  76. img_transpose(a, b, h, w);
  77. }
  78.  
  79. end_time = now();
  80. printf("img_transpose: %f ms\n", (double)(end_time - start_time) * ms_per_tick);
  81.  
  82. start_time = now();
  83.  
  84. for (int t = ; t < ; ++t)
  85. {
  86. img_transpose_block(b, a, w, h);
  87. img_transpose_block(a, b, h, w);
  88. }
  89.  
  90. end_time = now();
  91. printf("img_transpose_block: %f ms\n", (double)(end_time - start_time) * ms_per_tick);
  92.  
  93. delete [] a;
  94. delete [] b;
  95.  
  96. //// algorithm validation ////
  97. const int width = ;
  98. const int height = ;
  99. Vec *src_img = new Vec [width * height];
  100. Vec *dst_img = new Vec [height * width];
  101.  
  102. for (int j = ; j < height; ++j)
  103. {
  104. for (int i = ; i < width; ++i)
  105. {
  106. src_img[j * width + i].m128_i32[] = i;
  107. src_img[j * width + i].m128_i32[] = j;
  108. }
  109. }
  110.  
  111. img_transpose_block(dst_img, src_img, width, height);
  112.  
  113. for (int j = ; j < width; ++j)
  114. {
  115. for (int i = ; i < height; ++i)
  116. {
  117. int pi = dst_img[j * height + i].m128_i32[];
  118. int pj = dst_img[j * height + i].m128_i32[];
  119.  
  120. if (pi != j || pj != i)
  121. {
  122. printf("Algorithm is wrong!!!\n");
  123. goto END_OF_PROGRAM;
  124. }
  125. }
  126. }
  127.  
  128. END_OF_PROGRAM:
  129. printf("All done\n");
  130.  
  131. return ;
  132. }

A tiny program to benchmark image transpose algorithms的更多相关文章

  1. hey is a tiny program that sends some load to a web application.

    hey is a tiny program that sends some load to a web application. DOS attack DOS攻击生成 https://github.c ...

  2. 自己动手写一个编译器Tiny语言解析器实现

    然后,上一篇文章简介Tiny词法分析,实现语言.本文将介绍Tiny的语法分析器的实现. 1 Tiny语言的语法 下图是Tiny在BNF中的文法. 文法的定义能够看出.TINY语言有以下特点: 1 程序 ...

  3. Reading List on Automated Program Repair

    Some resources: https://www.monperrus.net/martin/automatic-software-repair 2017 [ ] DeepFix: Fixing ...

  4. [io benchmark]常用磁盘基准/压力测试工具

    Unix Disk I/O Benchmarks fio - NEW! fio is an I/O tool meant to be used both for benchmark and stres ...

  5. UVA - 10895 Matrix Transpose

    UVA - 10895 Matrix Transpose Time Limit:3000MS   Memory Limit:Unknown   64bit IO Format:%lld & % ...

  6. Awesome Go

    A curated list of awesome Go frameworks, libraries and software. Inspired by awesome-python. Contrib ...

  7. Go 语言相关的优秀框架,库及软件列表

    If you see a package or project here that is no longer maintained or is not a good fit, please submi ...

  8. Awesome Go (http://awesome-go.com/)

    A curated list of awesome Go frameworks, libraries and software. Inspired by awesome-python. Contrib ...

  9. Awesome Go精选的Go框架,库和软件的精选清单.A curated list of awesome Go frameworks, libraries and software

    Awesome Go      financial support to Awesome Go A curated list of awesome Go frameworks, libraries a ...

随机推荐

  1. centos7下源码安装mysql5.7.16

    一.下载源码包下载mysql源码包 http://mirrors.sohu.com/mysql/MySQL-5.7/mysql-5.7.16.tar.gz 二.安装约定: 用户名:mysql 安装目录 ...

  2. 表单提交的两种请求方式:post与get。post与get两者的对比分析

    post与get两者的对比分析:

  3. Castle ActiveRecord学习(五)使用HQL语句查询

    来源:http://www.cnblogs.com/Terrylee/archive/2006/04/12/372823.html 一.HQL简单介绍HQL全名是Hibernate Query Lan ...

  4. mvc EF 从数据库更新实体,添加视图实体时添加不上的问题

    视图对象没有一列为非null的,解决办法,在视图中,将某一列排除为null的可能,比如:isnull(te,1),即可.

  5. workerman使用

    1.start_timer.php(boc) <?php use \Workerman\Worker; use \Workerman\Lib\Timer; require_once '/var/ ...

  6. Git 初始状操作指引

    You have an empty repository To get started you will need to run these commands in your terminal. Ne ...

  7. mvc模拟实现

    .定义httpmodule <system.webServer> <modules> <add name="UrlRoutingModule" typ ...

  8. 使用delphi 开发多层应用(二十三)KbmMW 的WIB

    解释WIB 是什么之前,先回顾以下我们前面的各种服务工作方式.前面的各种服务的工作方式都是请求/应答方式. 客户端发送请求,服务器端根据客户端的请求,返回相应的结果.这种方式是一种顺序式访问,是一种紧 ...

  9. 2018.10.13 bzo1934: [Shoi2007]Vote 善意的投票(最小割)

    传送门 最小割定义题. 按照题意建边就行了. 考虑把冲突变成把aaa选入不与自己匹配的集合所需要付出的代价. 然后跑最小割就行了. 代码: #include<bits/stdc++.h> ...

  10. 2018.10.01 NOIP模拟 偷书(状压dp)

    传送门 状压dp经典题. 令f[i][j]f[i][j]f[i][j]表示到第i个,第i−k+1i-k+1i−k+1~iii个物品的状态是j时的最大总和. 然后简单维护一下转移就行了. 由于想皮一下果 ...