opencl gauss filter优化(二)

1.buffer使用image的方式：Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms.

// Image sampler shared by the kernels below: unnormalized (integer pixel)
// coordinates, out-of-range coordinates clamped to the image edge, and
// nearest-neighbour lookup (no interpolation).
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Horizontal pass of the Gaussian filter, scalar sliding-window version.
//
// Each work-item handles one image row `y` and walks along x, keeping the
// source samples currently under the filter in the private array `lines`.
//
// Parameters:
//   source    - image that is read (only the .x channel is used)
//   dest      - image that receives the filtered value for each (j, y)
//   imgWidth  - upper bound of the x loop
//   imgHeight - rows at or beyond this index are skipped
//
// NOTE(review): the numeric literals in this listing (global-id dimension,
// filter coefficients, window size, loop bounds, macro arguments) appear to
// have been lost, so it does not compile as shown. m_nFilter has 11 entries,
// and the notes after the listing use `lines[s-1]` and `j+5`, which suggests
// an 11-sample window centred on index 5 - confirm against the original.
__kernel void ImageGaussianFilterHorizontal(__read_only image2d_t  source, // Source image

                            __write_only image2d_t   dest,  // Intermediate dest image

                                             const int imgWidth ,                // Image width

                                             const int imgHeight)

{

    // Row handled by this work-item.
    const int y = get_global_id();

    // Guard against work-items launched past the last row.
    if(y>=(imgHeight))

        return;

    // Filter weights; each entry is divided by 256.0.
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

    // s: window length used by the shift loop; nStart: index of the tap that
    // is weighted on its own below (presumably the centre tap - confirm).
    const int s = ;

    const int nStart = ;

    // Sliding window of source samples for the current output pixel.
    float lines[];

    // Pre-load the window before the main loop.
    for(int i=;i<;i++)

        lines[i] = read_imagef( source, sampler,  (int2) (i-, y) ).x;

    // j is advanced inside the body, not in the for header.
    for(int j=;j<imgWidth;){

    // Start with the single tap at nStart, then add the paired taps.
    float sum = lines[nStart] * m_nFilter[nStart];

    // Adds two window samples that share the weight m_nFilter[m].
    // NOTE(review): `s--m` looks like a damaged `s-1-m` (mirror index) - confirm.
#define    GaussianTwoLines(m) \

    sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()    

        // Store the filtered value for pixel (j, y).
        write_imagef( dest, (int2) (j, y), sum );

        // Shift the window towards index 0 ...
        for(int i = ; i<s-; i++) lines[i] = lines[i+];

        j++;

        // ... and read one new sample into the slot written here.
        lines[s-] = read_imagef( source, sampler, (int2) (j+, y) ).x;

    }

}

// Vertical pass of the Gaussian filter, scalar sliding-window version.
//
// Each work-item handles one image column `x` and walks along y, keeping the
// source samples currently under the filter in the private array `lines`.
// The structure mirrors the horizontal kernel with the coordinates swapped.
//
// Parameters:
//   source    - image that is read (only the .x channel is used)
//   dest      - image that receives the filtered value for each (x, j)
//   imgWidth  - columns at or beyond this index are skipped
//   imgHeight - upper bound of the y loop
//
// NOTE(review): the numeric literals in this listing (global-id dimension,
// filter coefficients, window size, loop bounds, macro arguments) appear to
// have been lost, so it does not compile as shown - confirm against the
// original source.
__kernel void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image

                        __write_only image2d_t   dest,

                         const int imgWidth ,

                        const int imgHeight)

{

    // Column handled by this work-item.
    const int x = get_global_id();

    // Guard against work-items launched past the last column.
    if(x>=(imgWidth))

        return;

    // Filter weights; each entry is divided by 256.0.
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

    // s: window length used by the shift loop; nStart: index of the tap that
    // is weighted on its own below (presumably the centre tap - confirm).
    const int s = ;

    const int nStart = ;

    // Sliding window of source samples for the current output pixel.
    float lines[];

    // Pre-load the window before the main loop.
    for(int i=;i<;i++)

        lines[i] = read_imagef( source, sampler,  (int2) (x ,i-) ).x;

    // j is advanced inside the body, not in the for header.
    for(int j=;j<imgHeight;){

    // Start with the single tap at nStart, then add the paired taps.
    float sum = lines[nStart] * m_nFilter[nStart];

    // Adds two window samples that share the weight m_nFilter[m].
    // NOTE(review): `s--m` looks like a damaged `s-1-m` (mirror index) - confirm.
#define    GaussianTwoLines(m) \

    sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        // Store the filtered value for pixel (x, j).
        write_imagef( dest, (int2) (x, j), sum );

        // Shift the window towards index 0 ...
        for(int i = ; i<s-; i++) lines[i] = lines[i+];

        j++;

        // ... and read one new sample into the slot written here.
        lines[s-] = read_imagef( source, sampler, (int2) (x,j+) ).x;

    }

}

2.只运行 Horizontal 19ms,wait time 19ms. 注释掉 write_imagef 2.4ms(wait time,run time都是0.0xms)(更新：sum计算被优化,0.x ms就是读image的时间).

a.顺序调整为：

lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

write_imagef( dest, (int2) (j-1, y), sum );

16.9ms。很奇怪,sum用固定的0或0.2替代时,时间只有3.9ms;把计算部分注释掉、只读写image,也是3.9ms。计算sum的部分被编译器优化掉了?

b. if(sum>0)

lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

write_imagef( dest, (int2) (j-1, y), 0.2 );

如此测试,17ms,看来是sum的计算被优化掉了.

c.if(sum>=0)

j++;

//lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

//write_imagef( dest, (int2) (j-1, y), sum );

只计算,5.7ms,但还是wait time 5.7ms？？？

3.使用float16 vector 计算,总共耗时15.6 ms,wait time 9.3ms,run time 6.3ms.使用 __attribute__ 能减少1ms以内.其中Horizontal:wait time 9.4ms,run time 0.008ms ,Vertical:wait time 0.07ms,run time 6.4ms.

不知道为什么使用fma指令替代sum+= ,需要近2s,而且localWorksize最大只能32.

使用half16 精度，反而还要17ms,而且结果有1-2的误差。

// Image sampler used by the float16 kernels below: unnormalized (integer
// pixel) coordinates, out-of-range coordinates clamped to the image edge,
// and nearest-neighbour lookup (no interpolation).
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Horizontal pass of the Gaussian filter, float16 version.
//
// Each work-item handles one image row `y`. Every loop iteration loads 16
// consecutive source samples into a float16 (line1), combines them with the
// previous 16 (line0) through lane shifts, and writes 16 output pixels.
//
// Parameters:
//   source    - image that is read (only the .x channel is used)
//   dest      - image that receives the filtered values
//   imgWidth  - upper bound of the x loop
//   imgHeight - rows at or beyond this index are skipped
//
// NOTE(review): the numeric literals in this listing (work-group size hint,
// global-id dimension, filter coefficients and indices, macro offsets, loop
// start and step) appear to have been lost, so it does not compile as shown -
// confirm against the original source.
__kernel __attribute__((work_group_size_hint(,,)))

void ImageGaussianFilterHorizontal(__read_only image2d_t  source, // Source image

                                    __write_only image2d_t   dest,  // Intermediate dest image

                                     const int imgWidth ,                // Image width

                                     const int imgHeight)

{

    // Row handled by this work-item.
    const int y = get_global_id();

    // Guard against work-items launched past the last row.
    if(y>=(imgHeight))

        return;

    // Filter weights; each entry is divided by 256.0.
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

    // r   : read the .x channel of one source pixel.
    // r16 : gather 16 horizontally consecutive pixels starting at (x, y).
    // w16 : scatter the 16 lanes of `sum` to 16 horizontally consecutive pixels.
#define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x

#define r16(x,y) (float16)( r(x,y),r(x+1,y),r(x+2,y),r(x+3,y),r(x+4,y),r(x+5,y),r(x+6,y),r(x+7,y),\

                r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y))

#define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );\

        write_imagef( dest, (int2) (x+, y), sum.s2 );write_imagef( dest, (int2) (x+, y), sum.s3 );\

        write_imagef( dest, (int2) (x+, y), sum.s4 );write_imagef( dest, (int2) (x+, y), sum.s5 );\

        write_imagef( dest, (int2) (x+, y), sum.s6 );write_imagef( dest, (int2) (x+, y), sum.s7 );\

        write_imagef( dest, (int2) (x+, y), sum.s8 );write_imagef( dest, (int2) (x+, y), sum.s9 );\

        write_imagef( dest, (int2) (x+, y), sum.sa );write_imagef( dest, (int2) (x+, y), sum.sb );\

        write_imagef( dest, (int2) (x+, y), sum.sc );write_imagef( dest, (int2) (x+, y), sum.sd );\

        write_imagef( dest, (int2) (x+, y), sum.se );write_imagef( dest, (int2) (x+, y), sum.sf );

    // First 16 samples, loaded before the loop.
    float16 line0 =  r16(-,y);

    // j is advanced at the bottom of the body, not in the for header.
    for(int j=;j<imgWidth;){

        // Next 16 samples following line0.
        float16 line1 =  r16(j-+,y);

        float16 temp0;

        float16 temp1;

        // temp0 starts as line0; temp1 is the line0|line1 sequence advanced
        // by 10 lanes (line0.sa..sf followed by line1.s0..s9).
        temp0 = line0;

        temp1.s0123 = line0.sabcd;

        temp1.s45 = line0.sef;

        temp1.s67 = line1.s01;

        temp1.s89abcdef = line1.s23456789;

        // First pair of taps, both weighted by the same coefficient.
        float16 sum =  ( temp0 + temp1 ) * m_nFilter[];

        // Each following step moves temp0 one sample forward (shift lanes
        // down, refill lane f from line1) and temp1 one sample backward
        // (shift lanes up, refill lane 0 from line0), then adds the pair.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s0;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s9;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s1;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s8;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s2;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s7;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s3;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s6;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // Fifth forward shift of temp0: this tap has no partner and is
        // weighted on its own.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s4;

        sum += ( temp0 ) * m_nFilter[];

        // The samples just loaded become the "previous" block.
        line0 = line1;

        // Write the 16 results starting at (j, y).
        w16(j,y,sum );

        j+=;

    }

}

// Vertical pass of the Gaussian filter, float16 version.
//
// Each work-item handles one image column `x`. Every loop iteration loads 16
// vertically consecutive source samples into a float16 (line1), combines
// them with the previous 16 (line0) through lane shifts, and writes 16
// output pixels down the column.
//
// Parameters:
//   source    - image that is read (only the .x channel is used)
//   dest      - image that receives the filtered values
//   imgWidth  - columns at or beyond this index are skipped
//   imgHeight - upper bound of the y loop
//
// The rv16 macro below expands to r(), which is #defined inside the
// preceding horizontal kernel; this kernel must follow it in the same
// program source.
//
// NOTE(review): the numeric literals in this listing (work-group size hint,
// global-id dimension, filter coefficients and indices, macro offsets, loop
// start and step) appear to have been lost, so it does not compile as shown -
// confirm against the original source.
__kernel  __attribute__((work_group_size_hint(,,)))

void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image

                                __write_only image2d_t   dest,

                                 const int imgWidth ,

                                 const int imgHeight)

{

    // Column handled by this work-item.
    const int x = get_global_id();

    // Guard against work-items launched past the last column.
    if(x>=(imgWidth))

        return;

    // Filter weights; each entry is divided by 256.0.
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

    // rv16 : gather 16 vertically consecutive pixels starting at (x, y).
    // wv16 : scatter the 16 lanes of `sum` to 16 vertically consecutive pixels.
#define rv16(x,y) (float16)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3),r(x,y+4),r(x,y+5),r(x,y+6),r(x,y+7),\

                r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+))

#define wv16(x,y,sum) write_imagef( dest, (int2) (x,y), sum.s0 );write_imagef( dest, (int2) (x,y+1), sum.s1 );\

        write_imagef( dest, (int2) (x,y+), sum.s2 );write_imagef( dest, (int2) (x,y+), sum.s3 );\

        write_imagef( dest, (int2) (x,y+), sum.s4 );write_imagef( dest, (int2) (x,y+), sum.s5 );\

        write_imagef( dest, (int2) (x,y+), sum.s6 );write_imagef( dest, (int2) (x,y+), sum.s7 );\

        write_imagef( dest, (int2) (x,y+), sum.s8 );write_imagef( dest, (int2) (x,y+), sum.s9 );\

        write_imagef( dest, (int2) (x,y+), sum.sa );write_imagef( dest, (int2) (x,y+), sum.sb );\

        write_imagef( dest, (int2) (x,y+), sum.sc );write_imagef( dest, (int2) (x,y+), sum.sd );\

        write_imagef( dest, (int2) (x,y+), sum.se );write_imagef( dest, (int2) (x,y+), sum.sf );

    // First 16 samples, loaded before the loop.
    float16 line0 =  rv16(x,-);

    // j is advanced at the bottom of the body, not in the for header.
    for(int j=;j<imgHeight;){

        // Next 16 samples following line0.
        float16 line1 =  rv16(x,j-+);

        float16 temp0;

        float16 temp1;

        // temp0 starts as line0; temp1 is the line0|line1 sequence advanced
        // by 10 lanes (line0.sa..sf followed by line1.s0..s9).
        temp0 = line0;

        temp1.s0123 = line0.sabcd;

        temp1.s45 = line0.sef;

        temp1.s67 = line1.s01;

        temp1.s89abcdef = line1.s23456789;

        // First pair of taps, both weighted by the same coefficient.
        float16 sum =  ( temp0 + temp1 ) * m_nFilter[];

        // Each following step moves temp0 one sample forward (shift lanes
        // down, refill lane f from line1) and temp1 one sample backward
        // (shift lanes up, refill lane 0 from line0), then adds the pair.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s0;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s9;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s1;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s8;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s2;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s7;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s3;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s6;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // Fifth forward shift of temp0: this tap has no partner and is
        // weighted on its own.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s4;

        sum += ( temp0 ) * m_nFilter[];

        // The samples just loaded become the "previous" block.
        line0 = line1;

        // Write the 16 results starting at (x, j).
        wv16(x,j,sum );

        j+=;

    }

}

opencl gauss filter优化(二)的更多相关文章

opencl gauss filter优化(三)
1.根据前两次的最终结果: 使用普通buffer,Horizontal 5ms, Vertical 17 ms 使用image buffer:Horizontal 9.4ms, Vertical 6. ...
opencl gauss filter优化(一)
Platform: LG G3, Adreno 330 ,img size 3264x2448 C code neon GPU 300 60 29 单位:ms 1. 目前按如下行列分解的方式最快29m ...
Anisotropic gauss filter
最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...
EMW 性能优化二之---并发配置
EMW 性能优化二之---并发配置在前一个日志中写到交货的异步更新,对于RFUI RF的前台操作会提升效率,异步更新不用等待更新状态的返回,启用更新队列的方式执行(SM13). 下面再补全性能相关的 ...
MySQL优化二（连接优化和缓存优化）
body { font-family: Helvetica, arial, sans-serif; font-size: 14px; line-height: 1.6; padding-top: 10 ...
mysql优化二之锁机制
mysql优化二之锁机制 mysql提供了锁机制和MVCC机制来保证并发操作的安全性,这里主要讨论锁机制, MVCC见下篇文章 mysql的锁按照锁粒度可分为行锁与表锁,按照操作类型划分可读锁和写锁 ...
Emacs 启动优化二三事
Emacs 启动优化二三事 */--> div.org-src-container { font-size: 85%; font-family: monospace; } p {font-siz ...
MySQL性能优化(二)：优化数据库的设计
原文:MySQL性能优化(二):优化数据库的设计版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https://blog.csdn.n ...
二维高斯滤波器（gauss filter）的实现
我们以一个二维矩阵表示二元高斯滤波器,显然此二维矩阵的具体形式仅于其形状(shape)有关: def gauss_filter(kernel_shape): 为实现二维高斯滤波器,需要首先定义二元高斯 ...

随机推荐

安装python-devel 在升级到python2.7之后
分别执行如下命令: # yum update # yum install centos-release-SCL # yum search all python27 在搜索出的列表中发现python27 ...
短信轰炸PC版
前言之前用过android版短信轰炸的apk,于是想反编apk查看源码找短信接口,做一个PC版本的,不料反编失败.后不了了之... 昨日逛论坛时无意中看到一个网站有此功能,打开一试究竟,效果可以,于 ...
Java的Properties类和读取.properties文件
一..properties文件的作用 Properties属性文件在JAVA应用程序中是经常可以看得见的,也是特别重要的一类文件.它用来配置应用程序的一些信息,不过这些信息一般都是比较少的数据,没有必 ...
Mysql-学习笔记（==》集合函数与分组四）
-- 聚集函数配合分组语句 group by-- 显示最高分SELECT MAX(sscore) FROM db.`student`;-- 显示最高分学生的信息min maxSELECT * FRO ...
java-pfx文件转换成16进制内容
public static void main(String[] args) throws Exception { String path = "D://111.pfx"; Inp ...
实现Action类
实现Action类 1.Action类的作用: (1)封装HTTP的请求参数: (2)处理用户请求: (3)封装处理结果. 2.Action类是什么,在Action类中应该包含什么: Action类就 ...
Swift 动画学习笔记
视频地址: http://www.swiftv.cn/course/i275v5lz 1,动画属性 position(位置),opacity(透明度,0 全透明,1 不透明),Scale(尺寸),Co ...
UVA 1452 八 Jump
Jump Time Limit:3000MS Memory Limit:0KB 64bit IO Format:%lld & %llu Submit Status Practi ...
样式表中的 element.style样式如何修改
我们在写前面 web样式的时候,会发现有些时候,我们怎么修改 style里面的值,页面上的样式都不会修改,当你用工具查看时,会发现里面会有 element.style的值,这个值还找不到是在哪里出现的 ...
【转】The decoupling capacitor…is it really necessary?
Before working as an applications engineer, I worked as an IC test development engineer here at TI. ...

opencl gauss filter优化(二)

opencl gauss filter优化(二)的更多相关文章

随机推荐

热门专题