opencl gauss filter优化(一)
Platform: LG G3, Adreno 330, img size 3264x2448

| C code | neon | GPU |
| 300 | 60 | 29 |

单位:ms
1.
目前按如下行列分解的方式最快,耗时29ms。
Horizontal kernel:globalWorksize[1] = {height+256-height%256};
Vertical kernel:globalWorksize2[1] = {width+256-width%256}; localWorksize2[] = {64};
localWorksize2 手动设为64时最快。
Profile的结果为:Horizontal kernel 的 wait time 有11ms,实际 run time 18ms。
这个 wait time 是什么呢?注释掉 Horizontal kernel 中的 vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j); 则 wait time 只有0.x ms,并且 localWorksize 越小 wait time 越长,为1时达到200ms,16时20ms。
难道是写内存等待时间,没有足够的ALU指令隐藏访存延时?写内存后进入下一个for循环,马上又读内存,所以没有ALU指令隐藏这个延时。然而此时 Horizontal kernel 的 profile 结果实际 run time 只有0.x ms,所有时间基本都是在 wait。(更正:注释掉vstore16后,sum的计算被优化掉了,0.x ms是读内存的时间)
// Horizontal pass of the row/column-separated Gaussian filter over an 8-bit (uchar) image.
// Each work-item takes one row index from get_global_id, reads that row from `source`
// and writes the filtered row to `dest`.
// NOTE(review): most numeric literals (filter taps, shift amounts, vector offsets, loop
// steps) are missing from this listing -- apparently lost when the article was extracted.
// The surrounding text suggests an 11-tap filter, nStart = 5, a right shift of 8 and a
// 16-pixel vector step; confirm against the original source before compiling.
__kernel void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
__global uchar* restrict dest, // Intermediate dest image
const int imgWidth , // Image width
const int imgHeight)
{
const int y = get_global_id();
// The host pads the global work size past the image height (see the article text),
// so surplus work-items leave here without touching memory.
if(y>=(imgHeight))
return;
const uchar m_nRightShiftNum = ;
const uchar Rounding = ( << (m_nRightShiftNum - ));
const uchar m_nFilter[] = {,,,,,,,,,,}; const int s = ;
const int nStart = ;
// Row base pointers for this work-item.
const int nWidth = imgWidth; __global const uchar* pInLine = source + y*nWidth;
__global uchar* pOutLine = dest + y*nWidth;
int j;
// Left border, scalar: the first nStart pixels. An index that falls before the row start
// is negated (mirrored) -- presumably the comparison is against 0.
for(j = ; j < nStart; j ++)
{
ushort sum = ; for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart);
k1 = k1< ? -k1 : k1; int k2 = (j + nStart - m );
// One coefficient is applied to the pair of samples k1 and k2.
sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
}
// Centre tap, then round, shift and clamp to the output type.
sum += pInLine[j] * m_nFilter[s / ];
sum = (sum + Rounding) >> ;
pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));
// Interior, vectorised: the loop below loads 16 input bytes per tap with vload16, widens
// them to ushort16, accumulates, and stores 16 output bytes with vstore16.
} for ( ; (j+)<= (nWidth - nStart); j+=)
{
// NOTE(review): as listed, the first `ushort16 sum = ...` statement sits on the macro's
// continuation line and therefore becomes part of the macro body; the original source
// presumably had a line break after the macro's `sum += ...` statement.
#define GAUSSIAN_LINE_NEON(m) \
sum += ( convert_ushort16(vload16(,pInLine+j-nStart+m))* m_nFilter[m] ); ushort16 sum = (convert_ushort16(vload16(,pInLine+j-nStart)) * m_nFilter[]);
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON();
GAUSSIAN_LINE_NEON(); sum += (ushort)Rounding;
vstore16(convert_uchar16(sum>>(ushort)),,pOutLine+j) ;
// Right border, scalar: remaining pixels up to nWidth. An index at or past the row end
// is reflected back into the row.
} for( ; j < nWidth; j ++)
{
ushort sum = ; for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart); int k2 = (j + nStart - m );
k2 = k2 >= nWidth ? * nWidth - - k2 : k2;
sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
}
sum += pInLine[j] * m_nFilter[s / ];
sum = (sum + Rounding) >> m_nRightShiftNum;
pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));
}
// The brace opening the next line closes the horizontal kernel. The vertical kernel that
// follows filters along columns: each work-item takes one column index from get_global_id,
// reads the intermediate image and writes the final image, keeping a sliding window of
// column samples in the private array `lines`.
} __kernel void ImageGaussianFilterVertical( __global uchar* restrict source, // Intermediate image processed by ImageGaussianFilterHorizontal()
__global uchar* restrict dest, // Final destination image
const int imgWidth,
const int imgHeight
)
{
const int x = get_global_id();
// Surplus work-items from the padded global size exit here.
if(x>=(imgWidth))
return;
const int x_offset = x; const int s = ;
const int nStart = s / ;
const int m_nRightShiftNum = ;
const int Rounding = ( << (m_nRightShiftNum - ));
const uchar m_nFilter[] = {,,,,,,,,,,}; int y;
// NOTE(review): in this listing the declaration of `lines` has been folded into the
// line comment below, so it is not compiled; the original presumably declared
// `ushort lines[...]` on its own line.
// mem_fence(CLK_LOCAL_MEM_FENCE); ushort lines[];
// Prime the window: the centre slot gets the first sample of the column, and each row
// read below it is also copied to the mirrored slot above the centre.
lines[nStart] = (ushort)( source[x_offset] );
for(y=;y<=nStart;y++)
{
lines[nStart+y] = (ushort)( source[y*imgWidth+x_offset] );
lines[nStart-y] = lines[nStart+y];
// Main loop: filter one output pixel from the window, then slide the window down by
// one row and load the new bottom sample from row (y+nStart). `y` is advanced in the body.
} for(y=;y<(imgHeight-nStart-);)
{ ushort sum = lines[nStart] * m_nFilter[nStart];
// GaussianTwoLines(m) adds one symmetric pair of window slots weighted by coefficient m.
#define GaussianTwoLines(m) \
sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines() sum += (ushort)Rounding;
dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort)); y++;
// The next line shifts the window by one slot, loads the new last slot from `source`,
// closes the main loop, and opens the bottom-border loop.
for(int i = ; i<s-; i++) lines[i] = lines[i+]; lines[s-] = (ushort)( source[(y+nStart)*imgWidth+x_offset] ); } for(y=imgHeight-nStart-;y<(imgHeight-);)
{
// Bottom border: same filtering step, but the new last slot is taken from a sample
// already inside the window (index derived from imgHeight-y) instead of reading `source`.
ushort sum = lines[nStart] * m_nFilter[nStart];
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines() sum += (ushort)Rounding;
dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort)); y++;
for(int i = ; i<s-; i++) {
lines[i] = lines[i+];
}
lines[s-] = lines[(imgHeight-y)*-] ; //
}
//last y=imgHeight-1
// The final row is filtered from the window as it stands; no further shift is needed.
ushort sum = lines[nStart] * m_nFilter[nStart];
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines()
GaussianTwoLines() sum += (ushort)Rounding;
dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort));
}
kernel
2. Horizontal kernel改进:预先load 2x16个所需的pixel,计算时从中提取,这样每次循环只需读一次内存。耗时26ms,wait time 8ms。
// Interior loop of the horizontal kernel, reworked so that each iteration issues a single
// vload16: `line0` carries the 16 widened pixels loaded previously, `line1` is the next 16.
// All filter windows are then assembled from these two vectors with swizzles.
// NOTE(review): numeric literals (vload offsets, coefficient indices, loop step, shift)
// are missing from this listing; the article text suggests a step of 16 and a shift of 8.
ushort16 line0 = convert_ushort16(vload16(,pInLine+j-nStart));
for ( ; (j+)<= (nWidth - nStart); j+=)
{
ushort16 line1 = convert_ushort16(vload16(,pInLine+j-nStart+)); ushort16 temp0;
ushort16 temp1;
// temp0 = 16-wide window starting at lane 0 of line0.
// temp1 = 16-wide window starting at lane 10 (line0.sa..sf followed by line1.s0..s9).
temp0 = line0;
temp1.s0123 = line0.sabcd;
temp1.s45 = line0.sef;
temp1.s67 = line1.s01;
temp1.s89abcdef = line1.s23456789;
// Each pair of windows shares one coefficient, so they are summed before multiplying.
ushort16 sum = ( temp0 + temp1 ) * m_nFilter[];
// Slide temp0 one lane forward (start lane 1) and temp1 one lane back (start lane 9).
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s0;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s9;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 2 and 8.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s1;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s8;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 3 and 7.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s2;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s7;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 4 and 6.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s3;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s6;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Window starting at lane 5 is unpaired: the centre tap.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s4;
sum += ( temp0 ) * m_nFilter[]; sum += (ushort)Rounding;
// Reuse the freshly loaded vector as the "previous" one for the next iteration.
line0 = line1;
vstore16(convert_uchar16(sum>>(ushort)),,pOutLine+j) ;
}
3. 不做计算,只测试读写内存:wait time 3.2ms,run time 18.2ms。说明 Horizontal kernel 的 wait time 极限也需要3.2ms。但是如果只注释掉vstore16,保留读和计算,wait time 反而只有0.x ms,这又是为何?是读几乎没有wait,3.2ms都是写的wait time?(更正:注释掉vstore16后,sum的计算被优化掉了,0.x ms是读内存的时间)
a. 再次测试:只有读时 wait time 为0.x ms,只有写时 wait time 为3.2ms。写比读的周期长。
// Memory-only experiment: the interior loop with the filtering arithmetic removed, keeping
// one 16-byte load and one 16-byte store per iteration to measure memory wait time.
for ( ; (j+16)<= (nWidth - nStart); j+=16)
{
// The loaded value is never used; per the article's own correction, a compiler may drop
// work whose result is not stored.
ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16));
// NOTE(review): the first argument is a scalar 0 where vstore16 takes a 16-element vector;
// presumably shorthand for a zero uchar16 -- confirm against the original source.
vstore16(0,0,pOutLine+j) ;
}
b. 另外发现使用 *((__global uint4*)(pOutLine+j)) = as_uint4(result); 比 vstore16 快,wait time 2.5ms。高通 80-N8592-1_L_OpenCL_Programming_Guide 中提到:
Vectorized load/store of a larger data type is more optimal than a small data type; e.g., a load of uint2* is more optimal than uchar8* .
For optimal SP to L2 bandwidth performance, align read access to a 32-bit address and write access to a 128-bit address.
c. 原来写的内存地址没有对齐,改用 *((__global uint4*)(pOutLine+j-5)) = as_uint4(result); 后 wait time 为1.9ms。
d. 最后加上sum计算,采用的 Horizontal kernel 如下。localWorksize[] = {64} 时耗时最少,需要23ms,wait time 4.7ms;localWorksize = 128 时,wait time 6ms。
再加上 __attribute__((work_group_size_hint(64,1,1))) 后,耗时22ms。
// Horizontal Gaussian pass, final version from the article: one work-item per row, a single
// vload16 per interior iteration, and output written through a uint4 pointer at an address
// shifted back from `j` so that the stores are aligned.
// NOTE(review): most numeric literals (work-group hint, filter taps, shifts, offsets, array
// sizes, loop steps) are missing from this listing -- apparently lost in extraction. The
// article text gives work_group_size_hint(64,1,1), a store address of pOutLine+j-5, a
// 16-pixel step and a shift of 8; confirm against the original source.
// NOTE(review): the aligned store presumably relies on `dest` and imgWidth keeping every
// row start 16-byte aligned -- TODO confirm.
__kernel __attribute__((work_group_size_hint(,,)))
void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
__global uchar* restrict dest, // Intermediate dest image
const int imgWidth , // Image width
const int imgHeight)
{
const int y = get_global_id();
// Surplus work-items from the padded global size exit here.
if(y>=(imgHeight))
return;
const uchar m_nRightShiftNum = ;
const uchar Rounding = ( << (m_nRightShiftNum - ));
const uchar m_nFilter[] = {,,,,,,,,,,}; const int s = ;
const int nStart = ;
const int nWidth = imgWidth; __global const uchar* pInLine = source + y*nWidth;
__global uchar* pOutLine = dest + y*nWidth; int j;
// Left-border results are collected in this private array rather than written to
// `dest` directly; they are emitted as part of the first vector store below.
uchar temp[];
// Left border, scalar, with mirrored indexing for taps before the row start.
for(j = ; j < nStart; j ++)
{
ushort sum = ; for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart);
k1 = k1< ? -k1 : k1; int k2 = (j + nStart - m );
sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
}
sum += pInLine[j] * m_nFilter[s / ];
sum = (sum + Rounding) >> ;
temp[j] = (uchar)clamp(sum,(ushort),(ushort));
// `pre_result` carries the five output bytes (lanes b..f) not yet written; it is
// seeded from the left-border results held in `temp`.
} uchar16 result,pre_result;
pre_result.sbcde = (uchar4)(temp[],temp[],temp[],temp[]);
pre_result.sf = temp[]; ushort16 line0 = convert_ushort16(vload16(,pInLine+j-nStart));
for ( ; (j+)<= (nWidth - nStart); j+=)
{
//prefetch(pInLine+j-nStart,32); // made no difference
ushort16 line1 = convert_ushort16(vload16(,pInLine+j-nStart+)); ushort16 temp0;
ushort16 temp1;
// temp0 = window starting at lane 0 of line0; temp1 = window starting at lane 10
// (line0.sa..sf followed by line1.s0..s9). Paired windows share one coefficient.
temp0 = line0;
temp1.s0123 = line0.sabcd;
temp1.s45 = line0.sef;
temp1.s67 = line1.s01;
temp1.s89abcdef = line1.s23456789;
ushort16 sum = ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 1 and 9.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s0;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s9;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 2 and 8.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s1;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s8;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 3 and 7.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s2;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s7;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Windows starting at lanes 4 and 6.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s3;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s6;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Window starting at lane 5 is unpaired: the centre tap.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s4;
sum += ( temp0 ) * m_nFilter[]; sum += (ushort)Rounding;
// Build the 16 bytes to store: the five bytes carried over from the previous result
// (lanes b..f) followed by the first eleven bytes (lanes 0..a) of the new result.
line0 = line1; result.s0123 = pre_result.sbcde;
result.s4 = pre_result.sf;
pre_result = convert_uchar16(sum>>(ushort)) ; result.s5 = pre_result.s0;
result.s67 = pre_result.s12;
result.s89abcdef = pre_result.s3456789a;
// Single 128-bit store through a uint4 pointer at the shifted-back address.
*( (__global uint4*)(pOutLine+j-) ) = (as_uint4)(result) ;
// After the loop, flush the five bytes still held in pre_result: four as one uint,
// then the last one as a single byte.
} *( (__global uint*)(pOutLine+j-) ) = (as_uint)(pre_result.sbcde);//last 5 bytes
pOutLine[j-] = pre_result.sf; for( ; j < nWidth; j ++)
{
// Right border, scalar, with taps past the row end reflected back into the row.
ushort sum = ; for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart); int k2 = (j + nStart - m );
k2 = k2 >= nWidth ? * nWidth - - k2 : k2;
sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
}
sum += pInLine[j] * m_nFilter[s / ];
sum = (sum + Rounding) >> m_nRightShiftNum;
pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));
}
}
opencl gauss filter优化(一)的更多相关文章
- opencl gauss filter优化(三)
1.根据前两次的最终结果: 使用普通buffer,Horizontal 5ms, Vertical 17 ms 使用image buffer:Horizontal 9.4ms, Vertical 6. ...
- opencl gauss filter优化(二)
1.buffer使用image的方式:Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms. const sampler_t sampler = CLK_ ...
- Anisotropic gauss filter
最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...
- OpenCL Kernel设计优化
使用Intel® FPGA SDK for OpenCL™ 离线编译器,不需要调整kernel代码便可以将其最佳的适应于固定的硬件设备,而是离线编译器会根据kernel的要求自适应调整硬件的结构. 通 ...
- FILTER优化
explain plan for select a.* from fxqd_list_20131115_new_100 a where (acct_no, oper_no, seqno, trans_ ...
- 二维高斯滤波器(gauss filter)的实现
我们以一个二维矩阵表示二元高斯滤波器,显然此二维矩阵的具体形式仅于其形状(shape)有关: def gauss_filter(kernel_shape): 为实现二维高斯滤波器,需要首先定义二元高斯 ...
- 一次性能优化将filter转换
有一条SQL性能有问题,在运行计划中发现filter.遇到它要小心了,类似于nestloop.我曾经的blog对它有研究探索运行计划中filter的原理.用exists极易引起filter. 优化前: ...
- 安卓平台ARM Mali OpenCL例子-灰度转换(转)
手头一块RK3288的板子,在板子上测试了一张1080p的彩色图灰度转换的OpenCL例子.OpenCL没有任何优化.例子请移步这里. 该例子是编译成安卓平台下的可执行程序. 进入jni文件夹,进行如 ...
- OpenCV、OpenCL、OpenGL、OpenPCL
对于几个开源库的总结,作为标记,以前看过,现在开始重视起来!更详细资料请移步 开源中国社区! 涉及:OpenCV,OpenCL,OpenGL,OpenPCL 截止到目前: OpenGL的最新版本为4. ...
随机推荐
- Animator组件关闭再打开后参数丢失问题
问题如下,因为再激活Animator时,它会重置一次,参数也会丢失 这个问题一直存在,论坛给出的解释是把参数缓存下来,在激活时重置 http://answers.unity3d.com/questio ...
- WPF中父子窗口的层次关系
关于子窗体的层级关系总结一下哈,希望能对大家有些帮助 假设有这样两个窗体:RootWindow,SubWindow,在RootWindow中引发某事件而显示SubWindow 1,如果弹出窗体(比如S ...
- dubbo源码之三-模块依赖
dubbo版本:2.5.4 参照:http://www.tuicool.com/articles/qIN36ff
- profile、bashrc、bash_profile之间的区别和联系
/etc/profile:此文件为系统的每个用户设置环境信息,当用户第一次登录时,该文件被执行.并从/etc/profile.d目录的配置文件中搜集shell的设置. 英文描述为: # /etc/pr ...
- 高通APQ8074 spi 接口配置
高通APQ8074 spi 接口配置 8074 平台含有两个BLSP(BAM Low-Speed Peripheral) , 每一个BLSP含有两个QUP, 每一个QUP可以被配置为I2C, SPI, ...
- C# POST与Get数据
引用DLL 普通Get数据和Post数据 public static string Get(string URL) { String ReCode = string.Empty; try { Http ...
- TI公司Tina-ti和FilterProDesktop下载地址
http://www.ti.com/tool/tina-ti http://focus.ti.com/en/download/aap/DesignEnv/FilterPro-DT/FilterProD ...
- BDC、CATT批量数据维护
声明:原创作品,转载时请注明文章来自SAP师太技术博客( 博/客/园www.cnblogs.com):www.cnblogs.com/jiangzhengjun,并以超链接形式标明文章原始出处,否则将 ...
- poj 3335(半平面交)
链接:http://poj.org/problem?id=3335 //大牛们常说的测模板题 ------------------------------------------------- ...
- js 定时器的使用。 setInterval()
我需要实现的功能是:点击发送按钮,会出现 “已发送60s后可点击重发”,并且,60s 这个数字是随时变化的,60,59,58,57....0,然后再次返回到 发送 按钮. 类似效果,可参考 360首 ...