Platform: LG G3, Adreno 330, img size 3264x2448

| 实现 | C code | NEON | GPU |
| --- | --- | --- | --- |
| 耗时(单位:ms) | 300 | 60 | 29 |

1. 目前按如下行列分解的方式最快,耗时29ms:

Horizontal kernel: globalWorksize[1] = {height+256-height%256};

Vertical kernel: globalWorksize2[1] = {width+256-width%256};

localWorksize2[] = {64}; localWorksize2 手动设为64时最快。

Profile的结果为:Horizontal kernel 的 wait time 有11ms,实际 run time 18ms。

这个 wait time 是什么呢?注释掉 Horizontal kernel 中的 vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j); 则 wait time 只有 0.x ms。并且 localWorksize 越小 wait time 越长,为1时达到200ms,为16时20ms。

难道是写内存等待时间,没有足够的ALU指令隐藏访存延时?写内存后进入下一个for循环,马上又读内存,所以没有ALU指令隐藏这个延时。然而 Horizontal kernel 的 profile 结果实际 run time 只有 0.x ms,所有时间基本都是在 wait。(更正:注释掉vstore16后,sum的计算被优化掉了,0.x ms是读内存的时间)

  // Horizontal (row) pass of the row/column-decomposed Gaussian filter.
  // Each work-item filters one image row: it reads row y of `source` and
  // writes the filtered row y to `dest`.
  //
  // NOTE(review): the numeric literals of this listing (get_global_id()
  // argument, shift amount, filter taps, s/nStart, loop bounds and strides,
  // clamp limits) were lost when the page was extracted. The code is kept
  // exactly as found and will not compile until they are restored.
  //
  // source    - input image, one uchar per pixel, rows of imgWidth pixels
  // dest      - output of this pass, same layout as source
  // imgWidth  - row length in pixels
  // imgHeight - number of rows; work-items with y >= imgHeight do nothing
  1. __kernel void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
  2. __global uchar* restrict dest, // Intermediate dest image
  3. const int imgWidth , // Image width
  4. const int imgHeight)
  5. {
  // Row handled by this work-item; surplus work-items exit immediately.
  6. const int y = get_global_id();
  7. if(y>=(imgHeight))
  8. return;
  // Fixed-point parameters: results are rounded and shifted right by
  // m_nRightShiftNum after the weighted sum.
  9. const uchar m_nRightShiftNum = ;
  10. const uchar Rounding = ( << (m_nRightShiftNum - ));
  // Filter taps (11 entries, judging by the initializer's comma count).
  11. const uchar m_nFilter[] = {,,,,,,,,,,};
  12.  
  // s: filter length, nStart: distance from the window start to its centre
  // (values missing from the listing).
  13. const int s = ;
  14. const int nStart = ;
  15. const int nWidth = imgWidth;
  16.  
  // Start of the input and output row for this work-item.
  17. __global const uchar* pInLine = source + y*nWidth;
  18. __global uchar* pOutLine = dest + y*nWidth;
  19. int j;
  // Left border: scalar path for the first nStart pixels. The left tap index
  // k1 can fall before the row start, so it is reflected (-k1) back into the
  // row. Symmetric taps m and its mirror share one coefficient.
  20. for(j = ; j < nStart; j ++)
  21. {
  22. ushort sum = ;
  23.  
  24. for (int m = ; m<s / ; m++)
  25. {
  26. int k1 = (j + m - nStart);
  27. k1 = k1< ? -k1 : k1;
  28.  
  29. int k2 = (j + nStart - m );
  30. sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
  31. }
  // Centre tap, then rounding and normalisation.
  32. sum += pInLine[j] * m_nFilter[s / ];
  33. sum = (sum + Rounding) >> ;
  34. pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));
  35. }
  36.  
  // Interior: vector path. Every tap performs its own vload16 from the input
  // row, widens to ushort16 and accumulates into `sum`; one vstore16 writes
  // the 16 results. (Stride literal missing; presumably 16 pixels per
  // iteration, matching vload16/vstore16.)
  37. for ( ; (j+)<= (nWidth - nStart); j+=)
  38. {
  // GAUSSIAN_LINE_NEON(m): accumulate tap m for the 16 pixels starting at j.
  39. #define GAUSSIAN_LINE_NEON(m) \
  40. sum += ( convert_ushort16(vload16(,pInLine+j-nStart+m))* m_nFilter[m] );
  41.  
  42. ushort16 sum = (convert_ushort16(vload16(,pInLine+j-nStart)) * m_nFilter[]);
  43. GAUSSIAN_LINE_NEON();
  44. GAUSSIAN_LINE_NEON();
  45. GAUSSIAN_LINE_NEON();
  46. GAUSSIAN_LINE_NEON();
  47. GAUSSIAN_LINE_NEON();
  48. GAUSSIAN_LINE_NEON();
  49. GAUSSIAN_LINE_NEON();
  50. GAUSSIAN_LINE_NEON();
  51. GAUSSIAN_LINE_NEON();
  52. GAUSSIAN_LINE_NEON();
  53.  
  // Round, normalise and narrow back to 8 bits before storing.
  54. sum += (ushort)Rounding;
  55. vstore16(convert_uchar16(sum>>(ushort)),,pOutLine+j) ;
  56. }
  57.  
  // Right border: scalar path for the remaining pixels. Here the right tap
  // index k2 can run past the row end and is reflected back inside.
  58. for( ; j < nWidth; j ++)
  59. {
  60. ushort sum = ;
  61.  
  62. for (int m = ; m<s / ; m++)
  63. {
  64. int k1 = (j + m - nStart);
  65.  
  66. int k2 = (j + nStart - m );
  67. k2 = k2 >= nWidth ? * nWidth - - k2 : k2;
  68. sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
  69. }
  70. sum += pInLine[j] * m_nFilter[s / ];
  71. sum = (sum + Rounding) >> m_nRightShiftNum;
  72. pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));
  73. }
  74. }
  75.  
  // Vertical (column) pass of the row/column-decomposed Gaussian filter.
  // Each work-item walks one column x from top to bottom, keeping the pixels
  // under the filter in a small sliding window (`lines`) so that each source
  // pixel in the main loop is read from global memory once.
  //
  // NOTE(review): the numeric literals of this listing were lost when the
  // page was extracted (get_global_id() argument, s, shift amount, taps,
  // loop start values, macro arguments). The code is kept exactly as found
  // and will not compile until they are restored.
  //
  // source    - output of the horizontal pass, rows of imgWidth pixels
  // dest      - final filtered image, same layout
  // imgWidth  - row length in pixels; work-items with x >= imgWidth do nothing
  // imgHeight - number of rows
  76. __kernel void ImageGaussianFilterVertical( __global uchar* restrict source, // Intermediate image processed by ImageGaussianFilterHorizontal()
  77. __global uchar* restrict dest, // Final destination image
  78. const int imgWidth,
  79. const int imgHeight
  80. )
  81. {
  // Column handled by this work-item; surplus work-items exit immediately.
  82. const int x = get_global_id();
  83. if(x>=(imgWidth))
  84. return;
  85. const int x_offset = x;
  86.  
  // s: filter length, nStart: index of the centre tap inside the window.
  87. const int s = ;
  88. const int nStart = s / ;
  89. const int m_nRightShiftNum = ;
  90. const int Rounding = ( << (m_nRightShiftNum - ));
  91. const uchar m_nFilter[] = {,,,,,,,,,,};
  92.  
  93. int y;
  94. // mem_fence(CLK_LOCAL_MEM_FENCE);
  95.  
  // Sliding window over the column; lines[nStart] is the pixel being
  // filtered. Primed for the first output row: entries below the centre are
  // read from source, entries above it mirror them because those rows lie
  // outside the image.
  96. ushort lines[];
  97. lines[nStart] = (ushort)( source[x_offset] );
  98. for(y=;y<=nStart;y++)
  99. {
  100. lines[nStart+y] = (ushort)( source[y*imgWidth+x_offset] );
  101. lines[nStart-y] = lines[nStart+y];
  102. }
  103.  
  // Main body: rows for which the next window entry can still be read from
  // source. The loop header has no increment; y is advanced inside the body.
  104. for(y=;y<(imgHeight-nStart-);)
  105. {
  106.  
  // Centre tap first, then each symmetric pair of window entries shares one
  // coefficient (GaussianTwoLines).
  107. ushort sum = lines[nStart] * m_nFilter[nStart];
  108. #define GaussianTwoLines(m) \
  109. sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );
  110. GaussianTwoLines()
  111. GaussianTwoLines()
  112. GaussianTwoLines()
  113. GaussianTwoLines()
  114. GaussianTwoLines()
  115.  
  // Round, normalise and store the output pixel for row y.
  116. sum += (ushort)Rounding;
  117. dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort));
  118.  
  // Advance one row: shift the window by one entry and append the new
  // bottom pixel read from source.
  119. y++;
  120. for(int i = ; i<s-; i++) lines[i] = lines[i+];
  121.  
  122. lines[s-] = (ushort)( source[(y+nStart)*imgWidth+x_offset] );
  123.  
  124. }
  125.  
  // Bottom border: no source rows are read here. After the shift, the new
  // bottom entry is copied from an entry already in the window (index
  // derived from imgHeight-y; presumably a mirror reflection, the index
  // literals are missing).
  126. for(y=imgHeight-nStart-;y<(imgHeight-);)
  127. {
  128. ushort sum = lines[nStart] * m_nFilter[nStart];
  129. GaussianTwoLines()
  130. GaussianTwoLines()
  131. GaussianTwoLines()
  132. GaussianTwoLines()
  133. GaussianTwoLines()
  134.  
  135. sum += (ushort)Rounding;
  136. dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort));
  137.  
  138. y++;
  139. for(int i = ; i<s-; i++) {
  140. lines[i] = lines[i+];
  141. }
  142. lines[s-] = lines[(imgHeight-y)*-] ; //
  143. }
  // The last row is computed outside the loop, so the window is not shifted
  // again after it.
  144. //last y=imgHeight-1
  145. ushort sum = lines[nStart] * m_nFilter[nStart];
  146. GaussianTwoLines()
  147. GaussianTwoLines()
  148. GaussianTwoLines()
  149. GaussianTwoLines()
  150. GaussianTwoLines()
  151.  
  152. sum += (ushort)Rounding;
  153. dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort));
  154. }

kernel

2.Horizontal kernel改进,预先load 2x16个所需的pixel,计算时从中提取,这样每次循环只需读一次内存。需要26ms,wait time 8ms.

  // Replacement for the interior loop of the horizontal kernel: instead of
  // one vload16 per tap, two consecutive 16-pixel vectors (line0, line1) are
  // kept in registers and every shifted view is built from them with
  // swizzles, so each iteration issues a single load.
  //
  // NOTE(review): numeric literals (vload16 offset, loop stride, m_nFilter
  // indices, shift amount) were lost in extraction; the swizzle suffixes are
  // intact. This is a fragment; pInLine, pOutLine, j, nStart, nWidth,
  // m_nFilter and Rounding come from the enclosing kernel.
  1. ushort16 line0 = convert_ushort16(vload16(,pInLine+j-nStart));
  2. for ( ; (j+)<= (nWidth - nStart); j+=)
  3. {
  // Next 16 input pixels; becomes line0 at the end of the iteration.
  4. ushort16 line1 = convert_ushort16(vload16(,pInLine+j-nStart+));
  5.  
  // temp0 starts as the window at offset 0 (line0 itself); temp1 starts as
  // the window at offset 10 (line0 lanes a..f followed by line1 lanes 0..9).
  // These are the two outermost, symmetric taps.
  6. ushort16 temp0;
  7. ushort16 temp1;
  8. temp0 = line0;
  9. temp1.s0123 = line0.sabcd;
  10. temp1.s45 = line0.sef;
  11. temp1.s67 = line1.s01;
  12. temp1.s89abcdef = line1.s23456789;
  13. ushort16 sum = ( temp0 + temp1 ) * m_nFilter[];
  // Each following step moves temp0 one pixel to the right (shift lanes
  // down, pull the next lane from line1) and temp1 one pixel to the left
  // (shift lanes up, pull the previous lane from line0), then accumulates
  // the pair with their shared coefficient.
  14. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  15. temp0.sf = line1.s0;
  16. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  17. temp1.s0 = line0.s9;
  18. sum += ( temp0 + temp1 ) * m_nFilter[];
  19. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  20. temp0.sf = line1.s1;
  21. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  22. temp1.s0 = line0.s8;
  23. sum += ( temp0 + temp1 ) * m_nFilter[];
  24. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  25. temp0.sf = line1.s2;
  26. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  27. temp1.s0 = line0.s7;
  28. sum += ( temp0 + temp1 ) * m_nFilter[];
  29. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  30. temp0.sf = line1.s3;
  31. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  32. temp1.s0 = line0.s6;
  33. sum += ( temp0 + temp1 ) * m_nFilter[];
  // After five shifts temp0 is the window at offset 5: the centre tap, which
  // has no symmetric partner.
  34. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  35. temp0.sf = line1.s4;
  36. sum += ( temp0 ) * m_nFilter[];
  37.  
  // Round, reuse line1 as the next iteration's line0, normalise and store.
  38. sum += (ushort)Rounding;
  39. line0 = line1;
  40. vstore16(convert_uchar16(sum>>(ushort)),,pOutLine+j) ;
  41. }

3.不计算,只读写内存测试。那么wait time 3.2 ms,run time 18.2 ms.说明Horizontal kernel 耗时的极限也需3.2ms. 但是只是注释掉vstore16,还保留了读和计算,反而wait time还只有0.x ms,这又是为何?是读几乎没有wait,3.2ms都是写的wait time? (更正:注释掉vstore16后,sum的计算被优化掉了,0.x ms是读内存的时间)

a.再次测试,只有读wait time 0.xms ,只有写wait time 3.2ms.写比读的周期长.

// Memory-only test loop: the filter arithmetic is removed, leaving one
// 16-pixel load and one 16-byte store of zeros per iteration.
// NOTE(review): line1 is never used, so the compiler may drop the load
// (the author observed this kind of elimination in the earlier experiment).
for ( ; (j+16)<= (nWidth - nStart); j+=16)

{

ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16));

vstore16(0,0,pOutLine+j) ;

}

b.另外发现使用*((__global uint4*)(pOutLine+j)) = as_uint4(result);比vstore16快,wait time 2.5ms.高通 80-N8592-1_L_OpenCL_Programming_Guide 中提到:

Vectorized load/store of a larger data type is more optimal than a small data type; e.g., a load of uint2* is more optimal than uchar8* .

For optimal SP to L2 bandwidth performance, align read access to a 32-bit address and write access to a 128-bit address.

c.原来写的内存没有对齐,使用*((__global uint4*)(pOutLine+j-5)) = as_uint4(result);wait time 1.9ms.

d.最后加上sum计算,采用的Horizontal kernel如下,localWorksize[] = {64};时时间最少,需要23ms,wait time 4.7ms , localWorksize = 128时,wait 6ms.

并且使用__attribute__((work_group_size_hint(64,1,1))) ,耗时22ms.

  // Final horizontal (row) pass. Same filtering as the earlier horizontal
  // kernel, with two changes visible in the code:
  //   * the interior loop builds every tap from two register-held 16-pixel
  //     vectors (line0/line1) via swizzles, loading once per iteration;
  //   * results are written through a uint4 pointer at an address shifted
  //     back from pOutLine+j, so each store carries the last 5 results of
  //     the previous block plus the first 11 results of the current one.
  //     The left-border results are staged in `temp` for the first store.
  //
  // NOTE(review): the numeric literals of this listing were lost when the
  // page was extracted (work_group_size_hint arguments, get_global_id()
  // argument, taps, shift amounts, array sizes, loop bounds/strides, store
  // offsets). The code is kept exactly as found and will not compile until
  // they are restored.
  //
  // source    - input image, one uchar per pixel, rows of imgWidth pixels
  // dest      - output of this pass, same layout as source
  // imgWidth  - row length in pixels
  // imgHeight - number of rows; work-items with y >= imgHeight do nothing
  1. __kernel __attribute__((work_group_size_hint(,,)))
  2. void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
  3. __global uchar* restrict dest, // Intermediate dest image
  4. const int imgWidth , // Image width
  5. const int imgHeight)
  6. {
  // Row handled by this work-item; surplus work-items exit immediately.
  7. const int y = get_global_id();
  8. if(y>=(imgHeight))
  9. return;
  10. const uchar m_nRightShiftNum = ;
  11. const uchar Rounding = ( << (m_nRightShiftNum - ));
  12. const uchar m_nFilter[] = {,,,,,,,,,,};
  13.  
  14. const int s = ;
  15. const int nStart = ;
  16. const int nWidth = imgWidth;
  17.  
  18. __global const uchar* pInLine = source + y*nWidth;
  19. __global uchar* pOutLine = dest + y*nWidth;
  20.  
  21. int j;
  // Left-border results are kept here instead of being written to pOutLine
  // one byte at a time; they are merged into the first vector store below.
  22. uchar temp[];
  // Left border: scalar path, left tap index reflected back into the row.
  23. for(j = ; j < nStart; j ++)
  24. {
  25. ushort sum = ;
  26.  
  27. for (int m = ; m<s / ; m++)
  28. {
  29. int k1 = (j + m - nStart);
  30. k1 = k1< ? -k1 : k1;
  31.  
  32. int k2 = (j + nStart - m );
  33. sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
  34. }
  35. sum += pInLine[j] * m_nFilter[s / ];
  36. sum = (sum + Rounding) >> ;
  37. temp[j] = (uchar)clamp(sum,(ushort),(ushort));
  38. }
  39.  
  // Seed the carry: the border results are placed in lanes b..f of
  // pre_result, the same lanes the loop below treats as "last 5 results of
  // the previous block".
  40. uchar16 result,pre_result;
  41. pre_result.sbcde = (uchar4)(temp[],temp[],temp[],temp[]);
  42. pre_result.sf = temp[];
  43.  
  // Interior: line0/line1 hold two consecutive 16-pixel loads.
  44. ushort16 line0 = convert_ushort16(vload16(,pInLine+j-nStart));
  45. for ( ; (j+)<= (nWidth - nStart); j+=)
  46. {
  47. //prefetch(pInLine+j-nStart,32); //no change
  48. ushort16 line1 = convert_ushort16(vload16(,pInLine+j-nStart+));
  49.  
  // temp0 starts as the window at offset 0, temp1 as the window at offset 10
  // (line0 lanes a..f followed by line1 lanes 0..9): the outermost pair.
  50. ushort16 temp0;
  51. ushort16 temp1;
  52. temp0 = line0;
  53. temp1.s0123 = line0.sabcd;
  54. temp1.s45 = line0.sef;
  55. temp1.s67 = line1.s01;
  56. temp1.s89abcdef = line1.s23456789;
  57. ushort16 sum = ( temp0 + temp1 ) * m_nFilter[];
  // Each step slides temp0 one pixel right and temp1 one pixel left, then
  // accumulates the symmetric pair with its shared coefficient.
  58. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  59. temp0.sf = line1.s0;
  60. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  61. temp1.s0 = line0.s9;
  62. sum += ( temp0 + temp1 ) * m_nFilter[];
  63. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  64. temp0.sf = line1.s1;
  65. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  66. temp1.s0 = line0.s8;
  67. sum += ( temp0 + temp1 ) * m_nFilter[];
  68. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  69. temp0.sf = line1.s2;
  70. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  71. temp1.s0 = line0.s7;
  72. sum += ( temp0 + temp1 ) * m_nFilter[];
  73. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  74. temp0.sf = line1.s3;
  75. temp1.s0123456789abcdef = temp1.s00123456789abcde;
  76. temp1.s0 = line0.s6;
  77. sum += ( temp0 + temp1 ) * m_nFilter[];
  // temp0 is now the window at offset 5: the centre tap.
  78. temp0.s0123456789abcdef = temp0.s123456789abcdeff;
  79. temp0.sf = line1.s4;
  80. sum += ( temp0 ) * m_nFilter[];
  81.  
  82. sum += (ushort)Rounding;
  83. line0 = line1;
  84.  
  // Assemble the 16 bytes to store: lanes 0..4 are the 5 results carried over
  // from the previous block (or the left border), lanes 5..f are the first
  // 11 results of this block. The remaining 5 stay in pre_result for the
  // next iteration.
  85. result.s0123 = pre_result.sbcde;
  86. result.s4 = pre_result.sf;
  87. pre_result = convert_uchar16(sum>>(ushort)) ;
  88.  
  89. result.s5 = pre_result.s0;
  90. result.s67 = pre_result.s12;
  91. result.s89abcdef = pre_result.s3456789a;
  // Single 16-byte store through a uint4 pointer at the shifted-back address.
  92. *( (__global uint4*)(pOutLine+j-) ) = (as_uint4)(result) ;
  93. }
  94.  
  // Flush the results still held in pre_result after the last iteration:
  // four bytes through one uint store, the fifth as a scalar byte.
  95. *( (__global uint*)(pOutLine+j-) ) = (as_uint)(pre_result.sbcde);//last 5 bytes
  96. pOutLine[j-] = pre_result.sf;
  97.  
  // Right border: scalar path, right tap index reflected back into the row.
  98. for( ; j < nWidth; j ++)
  99. {
  100. ushort sum = ;
  101.  
  102. for (int m = ; m<s / ; m++)
  103. {
  104. int k1 = (j + m - nStart);
  105.  
  106. int k2 = (j + nStart - m );
  107. k2 = k2 >= nWidth ? * nWidth - - k2 : k2;
  108. sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
  109. }
  110. sum += pInLine[j] * m_nFilter[s / ];
  111. sum = (sum + Rounding) >> m_nRightShiftNum;
  112. pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));
  113. }
  114. }

opencl gauss filter优化(一)的更多相关文章

  1. opencl gauss filter优化(三)

    1.根据前两次的最终结果: 使用普通buffer,Horizontal 5ms, Vertical 17 ms 使用image buffer:Horizontal 9.4ms, Vertical 6. ...

  2. opencl gauss filter优化(二)

    1.buffer使用image的方式:Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms. const sampler_t sampler = CLK_ ...

  3. Anisotropic gauss filter

    最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...

  4. OpenCL Kernel设计优化

    使用Intel® FPGA SDK for OpenCL™ 离线编译器,不需要调整kernel代码便可以将其最佳的适应于固定的硬件设备,而是离线编译器会根据kernel的要求自适应调整硬件的结构. 通 ...

  5. FILTER优化

    explain plan for select a.* from fxqd_list_20131115_new_100 a where (acct_no, oper_no, seqno, trans_ ...

  6. 二维高斯滤波器(gauss filter)的实现

    我们以一个二维矩阵表示二元高斯滤波器,显然此二维矩阵的具体形式仅于其形状(shape)有关: def gauss_filter(kernel_shape): 为实现二维高斯滤波器,需要首先定义二元高斯 ...

  7. 一次性能优化将filter转换

    有一条SQL性能有问题,在运行计划中发现filter.遇到它要小心了,类似于nestloop.我曾经的blog对它有研究探索运行计划中filter的原理.用exists极易引起filter. 优化前: ...

  8. 安卓平台ARM Mali OpenCL例子-灰度转换(转)

    手头一块RK3288的板子,在板子上测试了一张1080p的彩色图灰度转换的OpenCL例子.OpenCL没有任何优化.例子请移步这里. 该例子是编译成安卓平台下的可执行程序. 进入jni文件夹,进行如 ...

  9. OpenCV、OpenCL、OpenGL、OpenPCL

    对于几个开源库的总结,作为标记,以前看过,现在开始重视起来!更详细资料请移步 开源中国社区! 涉及:OpenCV,OpenCL,OpenGL,OpenPCL 截止到目前: OpenGL的最新版本为4. ...

随机推荐

  1. CurrentHashMap的实现原理

    转载:http://wiki.jikexueyuan.com/project/java-collection/concurrenthashmap.html 概述 我们在之前的博文中了解到关于 Hash ...

  2. git总结

    1.先画个图,先对git的操作有个直观了解 2.分析下git中文件是怎么存储的 正如下面所示git存储不是每次更改就会产生一个新的文件,而是产生一个版本,这个版本对应着记录每个文件的不同情况 具体的存 ...

  3. nn package

    1.nn模块是神经网络模块 2.父类module,子类Sequential, Parallel和Concat 3.Linear:做线性变换 4.criterion 这个模块包含了各式各样的训练时的损失 ...

  4. 2016年11月22日 星期二 --出埃及记 Exodus 20:13

    2016年11月22日 星期二 --出埃及记 Exodus 20:13 "You shall not murder.不可杀人.

  5. CentOS 6.3下源码安装LAMP(Linux+Apache+Mysql+Php)环境

    一.简介 什么是LAMP    LAMP是一种Web网络应用和开发环境,是Linux, Apache, MySQL, Php/Perl的缩写,每一个字母代表了一个组件,每个组件就其本身而言都是在它所代 ...

  6. 我的android学习经历18

    今天主要学了几个android控件和使用两个适配器 ListView DatePicker和TimePicker GridView 适配器:SimpleAdapter和ArrayAdapter 都是常 ...

  7. HDU 5046 Airport(dlx)

    题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=5046 题意:n个城市修建m个机场,使得每个城市到最近进场的最大值最小. 思路:二分+dlx搜索判定. ...

  8. 只用css来美化的上传表单按钮(抄的迅雷的)

    <!DOCTYPE html><html><head><meta charset="utf-8" /><title>文件 ...

  9. 用PyAIML开发简单的对话机器人

    AIML files are a subset of Extensible Mark-up Language (XML) that can store different text patterns ...

  10. Populating Display Item Value On Query In Oracle Forms

    Write Post-Query trigger for the block you want to fetch the field value for display item.ExampleBeg ...