fast powf
测试结果:
sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
/*
* (c) Ian Stephenson
*
* ian@dctsystems.co.uk
*
* Fast pow() reference implementation
*/ /*
* http://www.dctsystems.co.uk/Software/power.html
* http://www.dctsystems.co.uk/Software/power.c
*/
// 2^23 is the weight of the lowest exponent bit of an IEEE-754 single when
// its bit pattern is read as an integer. Dividing that integer by 2^23 puts
// the biased exponent in the integer part and the mantissa in the fraction.
const float shift23 = (1 << 23);
const float OOshift23 = 1.0f / (1 << 23);

/// Floor of `a`, returned as a float.
///
/// Truncates toward zero and subtracts one only when truncation rounded up,
/// i.e. for negative non-integers. Exact integers, including negative ones,
/// are returned unchanged. `a` must fit in an int; larger magnitudes make the
/// float-to-int conversion undefined.
__forceinline float myFloorf(float a)
{
    const int truncated = (int)a;
    return (float)(truncated - (a < (float)truncated));
}

/// Approximate log2(i) for a positive, normal float `i`.
///
/// Reads the float's bit pattern as an integer, scales it by 2^-23 and removes
/// the exponent bias (127). That gives a piecewise-linear log2, which is then
/// corrected with a quadratic bump over the fractional part.
__forceinline float myLog2(float i)
{
    const float LogBodge = 0.346607f; // empirical weight of the correction term

    // memcpy is the well-defined way to reinterpret the bits of a float.
    int bits;
    memcpy(&bits, &i, sizeof bits);

    float x = (float)bits;
    x *= OOshift23; // 1/pow(2,23)
    x = x - 127;    // remove the exponent bias

    float y = x - myFloorf(x); // fractional part, in [0, 1)
    y = (y - y * y) * LogBodge;
    return x + y;
}

/// Approximate 2^i; the inverse construction of myLog2.
///
/// Applies the quadratic correction to the fractional part, adds the exponent
/// bias, scales by 2^23 and reinterprets the resulting integer as a float.
/// `i` must keep `i + 127` inside the valid exponent range (roughly
/// -127 < i < 128); nothing is clamped here.
__forceinline float myPow2(float i)
{
    const float PowBodge = 0.33971f; // empirical weight of the correction term

    float y = i - myFloorf(i); // fractional part, in [0, 1)
    y = (y - y * y) * PowBodge;

    float x = i + 127 - y; // add the exponent bias
    x *= shift23;          // pow(2,23)

    const int bits = (int)x;
    memcpy(&x, &bits, sizeof x);
    return x;
}

/// Approximate pow(a, b) as 2^(b * log2(a)). `a` must be positive.
__forceinline float myPow(float a, float b)
{
    return myPow2(b * myLog2(a));
}

///////////////////////////////////////
/* The code below is from http://code.google.com/p/fastapprox/ */
/// Approximate 2^p (fastapprox `fastpow2`).
///
/// Builds the IEEE-754 bit pattern of the result directly: the exponent field
/// comes from p plus a bias-like constant, and the mantissa from a rational
/// approximation in the fractional part z. Inputs below -126 are clamped;
/// there is no upper clamp, so p must stay below about 128.
__forceinline float fastpow2(float p)
{
    // (int) truncates toward zero, so for negative p add 1 to keep z >= 0.
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp;
    float z = clipp - w + offset; // fractional part of clipp
    // Multiplying by 2^23 moves the integer part into the exponent field.
    // The union reinterprets the bits; supported by MSVC, GCC and Clang.
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

/// Approximate log2(x) for positive, normal x (fastapprox `fastlog2`).
__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    // Keep the mantissa of x and force the exponent so that mx.f is in [0.5, 1).
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f; // 2^-23: exponent to integer part
    return y - 124.22551499f
             - 1.498030302f * mx.f
             - 1.72587999f / (0.3520887068f + mx.f);
}

/// Approximate pow(x, p) as 2^(p * log2(x)). `x` must be positive.
__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////
// Fallback for when <float.h> has not supplied FLT_MIN.
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
// Fallback for when <float.h> has not supplied FLT_MAX.
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif

/// Smaller of two values; returns `b` when they compare equal.
template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

/// |x|, computed by clearing the IEEE-754 sign bit.
__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE: in the future we may want to explicitly ask for a fused
    //       multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    //       explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

/// Reinterpret the bytes of `in` as an OUT_TYPE.
/// IN_TYPE and OUT_TYPE must have the same size; this is not checked.
template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    // memcpy is the well-defined way to type-pun; compilers lower it to a move.
    OUT_TYPE out;
    memcpy(&out, &in, sizeof(OUT_TYPE));
    return out;
}

/// Approximate log2(x).
///
/// x is clamped to [FLT_MIN, FLT_MAX], so zero, negative and infinite inputs
/// give a finite result instead of -inf/NaN.
__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    // Unbiased exponent: the 8 bits above the 23-bit mantissa, minus 127.
    int exponent = int(bits >> 23) - 127;
    // Mantissa re-tagged with exponent 0 is in [1, 2); subtract 1 for f in [0, 1).
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //   0  = 97.46%
    //   1  =  2.29%
    //   2  =  0.11%
    // Polynomial for log2(1 + f), evaluated as two independent Horner chains.
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

/// Approximate 2^x, with x clamped to [-126, 126].
__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x > 126.0f) x = 126.0f;
    // range reduction: x = m + fraction, with m truncated toward zero
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //   0  = 87.81%
    //   1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior, so m is
    //       converted to unsigned before being moved into the exponent field.
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

/// Approximate pow(x, y) that never returns NaN.
///
/// - y == 0 gives 1; otherwise x == 0 gives 0.
/// - y == 1 and y == 2 are computed exactly (the square is capped at FLT_MAX).
/// - Negative x is supported for integer y only; a non-integer y gives 0,
///   where powf would give NaN.
__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff; // bit pattern of |y|
        if (ybits >= 0x4b800000) {
            // |y| >= 2^24: always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent, 0..23 on this branch
            int j =  ybits >> (23 - k);   // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                // The low bit of j is the parity of |y|. Move it to the sign
                // bit of 1.0f, in unsigned arithmetic so the shift is defined.
                sign = bit_cast<unsigned, float>(0x3f800000u | (unsigned(j) << 31)); // +1 for even, -1 for odd
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // |y| < 1: not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}

/////////
/// Largest absolute element-wise difference between two arrays of length n.
static float max_abs_diff(const float *lhs, const float *rhs, int n)
{
    float max_err = 0.0f;
    for (int i = 0; i < n; ++i)
    {
        const float err = fabsf(lhs[i] - rhs[i]);
        if (err > max_err)
            max_err = err;
    }
    return max_err;
}

/// Benchmark driver.
///
/// Fills a buffer with random values in [0, 1000], raises each to the power
/// 0.8 with myPow, fastpow, fast_safe_pow and powf, prints the clock() ticks
/// each pass took, then prints each approximation's maximum absolute error
/// against powf.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // NOTE(review): the original sample count was lost when this listing was
    // extracted. 10M (about 200 MB across the five arrays) is a placeholder;
    // confirm against the author's source before comparing timings.
    const int N = 10000000;
    float *buf = new float[N];
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d = new float[N];
    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    // clock_t is not int on every platform, so keep the native type and cast
    // explicitly for printf.
    clock_t start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %ld\n", (long)(clock() - start_time));

    // b holds the powf reference results.
    printf("Error is %f\n", max_abs_diff(a, b, N));
    printf("Error2 is %f\n", max_abs_diff(b, c, N));
    printf("Error3 is %f\n", max_abs_diff(b, d, N));

    delete[] buf;
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    return 0;
}
fast powf的更多相关文章
- opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较
opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较 参考: http://wenku.baidu.com/link?url=1aDYAJBCrrK-uk2w3sSNai7h52x_ ...
- 基于Fast Bilateral Filtering 算法的 High-Dynamic Range(HDR) 图像显示技术。
一.引言 本人初次接触HDR方面的知识,有描述不正确的地方烦请见谅. 为方便文章描述,引用部分百度中的文章对HDR图像进行简单的描述. 高动态范围图像(High-Dynamic Range,简称HDR ...
- Fast RCNN 训练自己的数据集(3训练和检测)
转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ https://github.com/YihangLou/fas ...
- Fast RCNN 训练自己数据集 (2修改数据读取接口)
Fast RCNN训练自己的数据集 (2修改读写接口) 转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ http ...
- 网格弹簧质点系统模拟(Spring-Mass System by Fast Method)附源码
弹簧质点模型的求解方法包括显式欧拉积分和隐式欧拉积分等方法,其中显式欧拉积分求解快速,但积分步长小,两个可视帧之间需要多次积分,而隐式欧拉积分则需要求解线性方程组,但其稳定性好,能够取较大的积分步长. ...
- XiangBai——【AAAI2017】TextBoxes_A Fast Text Detector with a Single Deep Neural Network
XiangBai--[AAAI2017]TextBoxes:A Fast Text Detector with a Single Deep Neural Network 目录 作者和相关链接 方法概括 ...
- 论文笔记--Fast RCNN
很久之前试着写一篇深度学习的基础知识,无奈下笔之后发现这个话题确实太大,今天发一篇最近看的论文Fast RCNN.这篇文章是微软研究院的Ross Girshick大神的一篇作品,主要是对RCNN的一些 ...
- [转]Amazon DynamoDB – a Fast and Scalable NoSQL Database Service Designed for Internet Scale Applications
This article is from blog of Amazon CTO Werner Vogels. -------------------- Today is a very exciting ...
- FAST特征点检测features2D
#include <opencv2/core/core.hpp> #include <opencv2/features2d/features2d.hpp> #include & ...
随机推荐
- 什么场景应该用 MongoDB(转)
很多人比较关心 MongoDB 的适用场景,也有用户在话题里分享了自己的业务场景,比如: 案例1 用在应用服务器的日志记录,查找起来比文本灵活,导出也很方便.也是给应用练手,从外围系统开始使用Mong ...
- cron,at的权限控制
/etc/cron.deny存在 /etc/cron.deny不存在 /etc/cron.allow存在 只有/etc/cron.allow中列出的用户才能运行crontab -e:忽略/etc/ ...
- PAT 1065 单身狗(25)(STL-map+思路+测试点分析)
1065 单身狗(25 分) "单身狗"是中文对于单身人士的一种爱称.本题请你从上万人的大型派对中找出落单的客人,以便给予特殊关爱. 输入格式: 输入第一行给出一个正整数 N(≤ ...
- Maven系列(二)exec-maven-plugin
Maven系列(二)exec-maven-plugin 1. mvn 命令行运行 # exec:java 不会自动编译代码,你需要手动执行 mvn compile 来完成编译 mvn compile ...
- iis 应用程序预热
<applicationPools> <add name="appname" managedRuntimeVersion="v4.0" sta ...
- 低配NOSQL
东西写的太简单了 都不好意思说是NOSQL 其实就是STL 的map容器记录了写入的信息 解析了下数据仅此. 分析的时候想了很多 比如学习redis的自写hash,动态调整hash表容量. 比如右值或 ...
- ftp sftp vsftp
ftp sftp (secure) 是文件传输 协议 vsftp(very secure) 是 ftp 服务端 sftp 是 ssh 的一部分
- java实现word,ppt,excel,jpg转pdf
word,excel,jpeg 转 pdf 首先下载相关jar包:http://download.csdn.net/detail/xu281828044/6922499 import java.io. ...
- 2018.09.22 牧场的安排(状压dp)
描述 农民 John 购买了一处肥沃的矩形牧场,分成M*N(1 <= M <= 12; 1 <= N <= 12)个 格子.他想在那里的一些格子中种植美味的玉米.遗憾的是,有些 ...
- springboot问题,没有主清单属性
在pom.xml中添加 <build> <plugins> <plugin> <groupId>org.apache.maven.plugins< ...