1.buffer使用image的方式:Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms.

// Scalar (one float at a time) separable filter: a horizontal kernel followed by a
// vertical kernel that mirrors it with the roles of x and y swapped.
// NOTE(review): this listing appears to have lost most of its integer literals
// (the argument of get_global_id(), the value of s and nStart, the size of lines[],
// loop bounds, the GaussianTwoLines() arguments and the numerators of the /256.0
// weights), so it does not compile as shown. Restore them from the original source.
// The sampler uses unnormalized coordinates, clamp-to-edge addressing and nearest filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
// Horizontal pass: each work-item takes one row index y and walks j across imgWidth,
// writing one filtered value per pixel of that row into dest.
__kernel void ImageGaussianFilterHorizontal(__read_only image2d_t source, // Source image
__write_only image2d_t dest, // Intermediate dest image
const int imgWidth , // Image width
const int imgHeight)
{
const int y = get_global_id();
// Work-items whose row index is past the image height do nothing.
if(y>=(imgHeight))
return;
// Filter weights, each divided by 256.0; s and nStart are used below as the window
// length and the index of the first tap.
// NOTE(review): presumably s = 11 and nStart = 5 (eleven weights; the article later
// quotes a read at (j+5, y)) - TODO confirm.
const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0}; const int s = ;
const int nStart = ; float lines[];
// Pre-load the window with the .x channel of the source samples on row y, then loop over the row.
for(int i=;i<;i++)
lines[i] = read_imagef( source, sampler, (int2) (i-, y) ).x; for(int j=;j<imgWidth;){
float sum = lines[nStart] * m_nFilter[nStart];
// GaussianTwoLines(m) adds two window entries to sum, both scaled by the same weight m_nFilter[m].
#define GaussianTwoLines(m) \
sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
// After the last pair: store the result at (j, y), then shift the window (each
// entry is overwritten from a higher-indexed one), advance j and read one new sample.
GaussianTwoLines() write_imagef( dest, (int2) (j, y), sum ); for(int i = ; i<s-; i++) lines[i] = lines[i+];
j++;
lines[s-] = read_imagef( source, sampler, (int2) (j+, y) ).x;
}
// The closing brace below ends the horizontal kernel; the vertical kernel starts on the same line.
// Vertical pass: each work-item takes one column index x and walks j down imgHeight.
} __kernel void ImageGaussianFilterVertical(__read_only image2d_t source, // Source image
__write_only image2d_t dest,
const int imgWidth ,
const int imgHeight)
{
const int x = get_global_id();
// Work-items whose column index is past the image width do nothing.
if(x>=(imgWidth))
return;
// Same weight table and window variables as the horizontal kernel.
const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0}; const int s = ;
const int nStart = ; float lines[];
// Pre-load the window with the .x channel of the source samples in column x, then loop over the column.
for(int i=;i<;i++)
lines[i] = read_imagef( source, sampler, (int2) (x ,i-) ).x; for(int j=;j<imgHeight;){
float sum = lines[nStart] * m_nFilter[nStart];
// Redefinition of the same macro text used in the horizontal kernel.
#define GaussianTwoLines(m) \
sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
// After the last pair: store the result at (x, j), shift the window, advance j and read one new sample.
GaussianTwoLines() write_imagef( dest, (int2) (x, j), sum ); for(int i = ; i<s-; i++) lines[i] = lines[i+];
j++;
lines[s-] = read_imagef( source, sampler, (int2) (x,j+) ).x;
}
}

2.只运行 Horizontal 19ms,wait time 19ms. 注释掉 write_imagef 2.4ms(wait time,run time都是0.0xms)(更新:sum计算被优化,0.x ms就是读image的时间).

a.顺序调整为:

lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

write_imagef( dest, (int2) (j-1, y), sum );

16.9ms,很奇怪sum用固定的0,0.2替代时间只有3.9ms?????把计算部分注释掉,只读写image,也是3.9ms, 计算sum的部分被编译器优化掉了?

b. if(sum>0)

lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

write_imagef( dest, (int2) (j-1, y), 0.2 );

如此测试,17ms,看来是sum的计算被优化掉了.

c.if(sum>=0)

j++;

//lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

//write_imagef( dest, (int2) (j-1, y), sum );

只计算,5.7ms,但还是wait time 5.7ms???

3.使用float16 vector 计算,总共耗时15.6 ms,wait time 9.3ms,run time 6.3ms.使用 __attribute__ 能减少1ms以内.其中Horizontal:wait time 9.4ms,run time 0.008ms ,Vertical:wait time 0.07ms,run time 6.4ms.

不知道为什么使用fma指令替代sum+= ,需要近2s,而且localWorksize最大只能32.

使用half16 精度,反而还要17ms,而且结果有1-2的误差。

// float16 version of the separable filter: each loop iteration produces 16 output
// pixels at once from two 16-sample vectors (line0 = previous, line1 = next).
// NOTE(review): this listing appears to have lost most of its integer literals
// (get_global_id() argument, work_group_size_hint values, weight numerators,
// m_nFilter[] indices, macro offsets after x+7 / y+7, loop start and step), and
// several #define directives are fused onto the end of other lines, so it does
// not compile as shown. Restore them from the original source.
// The sampler uses unnormalized coordinates, clamp-to-edge addressing and nearest filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Horizontal pass: each work-item takes one row index y and walks j across imgWidth.
__kernel __attribute__((work_group_size_hint(,,)))
void ImageGaussianFilterHorizontal(__read_only image2d_t source, // Source image
__write_only image2d_t dest, // Intermediate dest image
const int imgWidth , // Image width
const int imgHeight)
{
const int y = get_global_id();
// Work-items whose row index is past the image height do nothing.
if(y>=(imgHeight))
return;
// Filter weights, each divided by 256.0. On the same line, r(xc,y) is defined as
// a read of the .x channel of source at (xc, y).
const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0}; #define r(xc,y) read_imagef( source, sampler, (int2) (xc, y) ).x
// r16(x,y) packs 16 r() reads at increasing x offsets into one float16.
// w16(x,y,sum) writes components s0..sf of sum to 16 dest pixels at increasing x offsets.
#define r16(x,y) (float16)( r(x,y),r(x+1,y),r(x+2,y),r(x+3,y),r(x+4,y),r(x+5,y),r(x+6,y),r(x+7,y),\
r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y)) #define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );\
write_imagef( dest, (int2) (x+, y), sum.s2 );write_imagef( dest, (int2) (x+, y), sum.s3 );\
write_imagef( dest, (int2) (x+, y), sum.s4 );write_imagef( dest, (int2) (x+, y), sum.s5 );\
write_imagef( dest, (int2) (x+, y), sum.s6 );write_imagef( dest, (int2) (x+, y), sum.s7 );\
write_imagef( dest, (int2) (x+, y), sum.s8 );write_imagef( dest, (int2) (x+, y), sum.s9 );\
write_imagef( dest, (int2) (x+, y), sum.sa );write_imagef( dest, (int2) (x+, y), sum.sb );\
write_imagef( dest, (int2) (x+, y), sum.sc );write_imagef( dest, (int2) (x+, y), sum.sd );\
write_imagef( dest, (int2) (x+, y), sum.se );write_imagef( dest, (int2) (x+, y), sum.sf ); float16 line0 = r16(-,y);
// line0 was pre-loaded above; each iteration loads the following 16 samples into line1.
for(int j=;j<imgWidth;){
float16 line1 = r16(j-+,y); float16 temp0;
float16 temp1;
// temp0 starts as line0; temp1 is assembled from line0.sa..sf followed by line1.s0..s9.
temp0 = line0;
temp1.s0123 = line0.sabcd;
temp1.s45 = line0.sef;
temp1.s67 = line1.s01;
temp1.s89abcdef = line1.s23456789;
float16 sum = ( temp0 + temp1 ) * m_nFilter[];
// Each following step moves temp0 one lane toward s0 (taking its new sf from line1)
// and temp1 one lane toward sf (taking its new s0 from line0), then accumulates
// (temp0 + temp1) scaled by one weight.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s0;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s9;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s1;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s8;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s2;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s7;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s3;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s6;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Final term uses temp0 alone (no temp1 partner); then line1 becomes line0 for the next iteration.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s4;
sum += ( temp0 ) * m_nFilter[]; line0 = line1;
w16(j,y,sum );
j+=;
// The braces below close the loop and the horizontal kernel; the vertical kernel starts on the same line.
// Vertical pass: each work-item takes one column index x and walks j down imgHeight.
} } __kernel __attribute__((work_group_size_hint(,,)))
void ImageGaussianFilterVertical(__read_only image2d_t source, // Source image
__write_only image2d_t dest,
const int imgWidth ,
const int imgHeight)
{
const int x = get_global_id();
// Work-items whose column index is past the image width do nothing.
if(x>=(imgWidth))
return;
// rv16(x,y) packs 16 r() reads at increasing y offsets into one float16; it relies
// on the r() macro defined in the horizontal kernel of this listing.
// wv16(x,y,sum) writes components s0..sf of sum to 16 dest pixels at increasing y offsets.
const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0}; #define rv16(x,y) (float16)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3),r(x,y+4),r(x,y+5),r(x,y+6),r(x,y+7),\
r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+)) #define wv16(x,y,sum) write_imagef( dest, (int2) (x,y), sum.s0 );write_imagef( dest, (int2) (x,y+1), sum.s1 );\
write_imagef( dest, (int2) (x,y+), sum.s2 );write_imagef( dest, (int2) (x,y+), sum.s3 );\
write_imagef( dest, (int2) (x,y+), sum.s4 );write_imagef( dest, (int2) (x,y+), sum.s5 );\
write_imagef( dest, (int2) (x,y+), sum.s6 );write_imagef( dest, (int2) (x,y+), sum.s7 );\
write_imagef( dest, (int2) (x,y+), sum.s8 );write_imagef( dest, (int2) (x,y+), sum.s9 );\
write_imagef( dest, (int2) (x,y+), sum.sa );write_imagef( dest, (int2) (x,y+), sum.sb );\
write_imagef( dest, (int2) (x,y+), sum.sc );write_imagef( dest, (int2) (x,y+), sum.sd );\
write_imagef( dest, (int2) (x,y+), sum.se );write_imagef( dest, (int2) (x,y+), sum.sf ); float16 line0 = rv16(x,-);
// Same accumulation scheme as the horizontal kernel, applied along the column.
for(int j=;j<imgHeight;){
float16 line1 = rv16(x,j-+); float16 temp0;
float16 temp1;
// temp0 starts as line0; temp1 is assembled from line0.sa..sf followed by line1.s0..s9.
temp0 = line0;
temp1.s0123 = line0.sabcd;
temp1.s45 = line0.sef;
temp1.s67 = line1.s01;
temp1.s89abcdef = line1.s23456789;
float16 sum = ( temp0 + temp1 ) * m_nFilter[];
// Each following step shifts temp0 and temp1 by one lane in opposite directions and accumulates.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s0;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s9;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s1;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s8;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s2;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s7;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s3;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s6;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Final term uses temp0 alone; then line1 becomes line0 for the next iteration.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s4;
sum += ( temp0 ) * m_nFilter[]; line0 = line1;
wv16(x,j,sum );
j+=;
}
}

opencl gauss filter优化(二)的更多相关文章

  1. opencl gauss filter优化(三)

    1.根据前两次的最终结果: 使用普通buffer,Horizontal 5ms, Vertical 17 ms 使用image buffer:Horizontal 9.4ms, Vertical 6. ...

  2. opencl gauss filter优化(一)

    Platform: LG G3, Adreno 330 ,img size 3264x2448 C code neon GPU 300 60 29 单位:ms 1. 目前按如下行列分解的方式最快29m ...

  3. Anisotropic gauss filter

    最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...

  4. EMW 性能优化二之---并发配置

    EMW 性能优化二之---并发配置 在前一个日志中写到交货的异步更新,对于RFUI RF的前台操作会提升效率,异步更新不用等待更新状态的返回,启用更新队列的方式执行(SM13). 下面再补全性能相关的 ...

  5. MySQL优化二(连接优化和缓存优化)

    body { font-family: Helvetica, arial, sans-serif; font-size: 14px; line-height: 1.6; padding-top: 10 ...

  6. mysql优化二之锁机制

    mysql优化二之锁机制 mysql提供了锁机制和MVCC机制来保证并发操作的安全性,这里主要讨论锁机制, MVCC见下篇文章 mysql的锁按照锁粒度可分为行锁与表锁,按照操作类型划分可读锁和写锁 ...

  7. Emacs 启动优化二三事

    Emacs 启动优化二三事 */--> div.org-src-container { font-size: 85%; font-family: monospace; } p {font-siz ...

  8. MySQL性能优化(二):优化数据库的设计

    原文:MySQL性能优化(二):优化数据库的设计 版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https://blog.csdn.n ...

  9. 二维高斯滤波器(gauss filter)的实现

    我们以一个二维矩阵表示二元高斯滤波器,显然此二维矩阵的具体形式仅于其形状(shape)有关: def gauss_filter(kernel_shape): 为实现二维高斯滤波器,需要首先定义二元高斯 ...

随机推荐

  1. 实现gabor filter的滤波

    实现gabor filter的滤波       图像纹理对于航空遥感图片.织物图案.复杂自然风景和动植物都适合.这里我采用遥感图片.织物图案和钢铁表面来做,并和canny图片进行一定的对比.     ...

  2. common-pool2 使用

    common-pool2提供了3中对象池管理方式,它们的使用方式基本一样,这里以GenericObjectPool对象池为例介绍其使用方式,一般实现自己的对象池需要经过2个步骤 1.实现PooledO ...

  3. 使用Jvisualvm监控JVM的内存、CPU、线程

    最近做性能测试发现很多性能问题,面对一些开发小白的数据结构思想,真想喊一声:放开那个代码,让我来!冲动. 面对WEB站点开发,性能测试是经常要做的,下面一种介绍如何结合性能测试工具,更好的监控WEB服 ...

  4. iOS—最全的真机测试教程

    准备 开发者账号 自从Xcode7 出来之后,一般的真机测试不需要开发者账号,也就不需要看这篇教程,只有app具有“推送”等功能的时候,要真机测试就必须要开发者账号和设置证书.苹果只是让你体验一下它的 ...

  5. winform继承窗体,无法修改父窗体控件问题处理笔记

    问题描述: 一个窗体集成父窗体,发现无法直接修改父窗体的控件,比如修改大小等,父窗体控件已经设置为public,如果做成一个dll被引用无此问题 特征: 不禁使父窗体控件,就算新加一个控件也会这样:鼠 ...

  6. An Example of On-Error Trigger in Oracle Forms

    I wrote this trigger around 4 years ago to handle errors in an application based on Oracle Forms 6i. ...

  7. [HDOJ5933]ArcSoft's Office Rearrangement(贪心)

    题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=5933 题意:长度为nn的数组: a_1, a_2, \cdotsa​1​​,a​2​​,⋯, 每次操作 ...

  8. window--窗口

    创建窗口 1. 通过标签窗口窗口. <div id="win" class="easyui-window" title="My Window&q ...

  9. Struts+Hibernate+jsp页面 实现分页

    dao层数据库代码: package com.hanqi.dao; import java.util.ArrayList; import java.util.List; import org.hibe ...

  10. Java开发中经典的小实例-(随机数)

    import java.util.Random;//输出小于33的7个不相同的随机数public class probability {    static Random random = new R ...