fast powf

测试结果：

sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389

#define NOMINMAX

#include <windows.h>

#include <math.h>

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <xmmintrin.h>

#include <vector>

/*

 * (c) Ian Stephenson

 *

 * ian@dctsystems.co.uk

 *

 * Fast pow() reference implementation

 */

/*

 * http://www.dctsystems.co.uk/Software/power.html

 * http://www.dctsystems.co.uk/Software/power.c

 */

// 2^23. Multiplying by this moves a value up past the 23 mantissa bits of a
// single-precision float, i.e. into the exponent field of its bit pattern.
const float shift23=(1<<23);

// 2^-23, the reciprocal of shift23 (exactly representable as a float).
const float OOshift23=1.0/(1<<23);

// Cheap floor() for floats whose value fits in an int.
//
// The float -> int conversion truncates toward zero, which already equals the
// floor for non-negative inputs and for exact integers. Only a negative input
// with a fractional part needs one more step down. (The previous form
// subtracted 1 for every negative input, so myFloorf(-2.0f) returned -3.0f.)
//
// Inputs outside the range of int are not supported: the conversion is
// undefined for them.
__forceinline float myFloorf(float a)
{
    const int truncated = (int)a;

    // (a < truncated) is 1 exactly when truncation rounded a negative value up.
    return (float)(truncated - (a < (float)truncated));
}

// Approximate log2(i) for a positive, finite, normalised float.
//
// Reading the float's bit pattern as an integer and scaling it by 2^-23 gives
// (biased exponent + mantissa fraction); removing the exponent bias of 127
// leaves a piecewise-linear estimate of log2(i). A quadratic correction on the
// fractional part, weighted by the empirical constant LogBodge, then reduces
// the error of that estimate.
__forceinline float myLog2(float i)
{
    const float LogBodge=0.346607f;

    // Reinterpret the bits of i as an int (same union idiom as the other
    // bit-twiddling helpers in this file).
    union { float f; int bits; } pun = { i };

    float x=(float)pun.bits;
    x*= OOshift23; //1/pow(2,23);
    x=x-127;       // remove the IEEE-754 exponent bias

    float y=x-myFloorf(x);   // fractional part, in [0,1]
    y=(y-y*y)*LogBodge;      // zero at both ends of each interval

    return x+y;
}

// Approximate 2^i; the inverse of myLog2.
//
// Builds the result's bit pattern arithmetically: (i + 127) * 2^23, taken as
// an integer, puts the biased exponent in the exponent field and the
// fractional part of i in the mantissa. A quadratic correction weighted by the
// empirical constant PowBodge is subtracted first to reduce the error of that
// piecewise-linear estimate.
//
// The argument must keep (i + 127) * 2^23 inside the positive int range
// (roughly -127 < i < 128); nothing here clamps it.
__forceinline float myPow2(float i)
{
    const float PowBodge=0.33971f;

    float y=i-myFloorf(i);   // fractional part, in [0,1]
    y=(y-y*y)*PowBodge;

    float x=i+127-y;         // add the IEEE-754 exponent bias
    x*= shift23; //pow(2,23);

    // Store the integer value of x as the bit pattern of the result.
    union { int bits; float f; } pun;
    pun.bits=(int)x;

    return pun.f;
}

// Approximate a^b as 2^(b * log2(a)), built from myLog2 and myPow2.
__forceinline float myPow(float a, float b)
{
    const float log2_of_a = myLog2(a);
    return myPow2(b * log2_of_a);
}

///////////////////////////////////////

/* The code below is from http://code.google.com/p/fastapprox/ */

// Approximate 2^p (fastapprox "fastpow2").
//
// The result's bit pattern is computed directly: clipp supplies the exponent,
// and a rational function of the fractional part z approximates the mantissa.
// Everything is scaled by 2^23 so the integer part lands in the exponent field.
// Inputs below -126 are clamped to -126; there is no clamp for large positive
// p.
__forceinline float fastpow2(float p)
{
    // (int) truncates toward zero, so for negative p add 1 to keep z >= 0.
    float offset = (p < 0) ? 1.0f : 0.0f;

    float clipp = (p < -126) ? -126.0f : p;

    int w = (int)clipp;

    float z = clipp - w + offset;

    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };

    return v.f;
}

// Approximate log2(x) (fastapprox "fastlog2").
//
// The raw bit pattern of x, scaled by 2^-23, provides the exponent part; the
// mantissa bits are re-tagged with the exponent of 0.5 and fed to a small
// rational correction term.
__forceinline float fastlog2(float x)
{
    union FloatBits { float f; unsigned int i; };

    FloatBits raw = { x };

    // Same mantissa as x, exponent field forced to 0x3f000000.
    FloatBits mantissa;
    mantissa.i = (raw.i & 0x007FFFFF) | 0x3f000000;

    // 1.1920928955078125e-7f is 2^-23.
    const float scaled = (float)raw.i * 1.1920928955078125e-7f;

    return scaled - 124.22551499f
        - 1.498030302f * mantissa.f
        - 1.72587999f / (0.3520887068f + mantissa.f);
}

// Approximate x^p as 2^(p * log2(x)), built from fastlog2 and fastpow2.
__forceinline float fastpow(float x, float p)
{
    const float log2_of_x = fastlog2(x);
    return fastpow2(p * log2_of_x);
}

/////////////////////////////////////////////////

// Limits of single-precision float. <float.h> defines the same two macros, so
// only supply them here when no earlier header already has; this avoids
// macro-redefinition diagnostics.
#ifndef FLT_MIN
#define FLT_MIN        1.175494351e-38F   // smallest positive normalised float
#endif

#ifndef FLT_MAX
#define FLT_MAX        3.402823466e+38F   // largest finite float
#endif

// Returns a when a < b, otherwise b (so b is returned when the two compare
// equal or are unordered).
template <typename T>
__forceinline T min(T a, T b)
{
    if (a < b)
        return a;
    return b;
}

// Absolute value of x, computed by clearing the top (sign) bit of its bit
// pattern.
__forceinline float fast_fabs(float x)
{
    union FloatBits { float f; unsigned int i; };

    FloatBits value = {x};
    value.i = value.i & 0x7FFFFFFF;
    return value.f;
}

/// Multiply and add: returns (a * b) + c.
///
/// Works for any type T that supports `a * b + c`. The expression is kept as a
/// single statement on purpose: the notes below explain that some compilers
/// fuse it into one multiply-add instruction, so splitting it could change
/// rounding.

template <typename T>

__forceinline T madd (const T& a, const T& b, const T& c) {

    // NOTE:  in the future we may want to explicitly ask for a fused

    // multiply-add in a specialized version for float.

    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless

    // explicitly asked not to, clang seems to leave the code alone.

    return a * b + c;

}

// Reinterprets the bits of `in` as an OUT_TYPE by writing one member of a
// union and reading the other. Callers in this file use it between float and
// int/unsigned.
template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union Pun {
        IN_TYPE  source;
        OUT_TYPE target;
    };

    Pun pun;
    pun.source = in;
    return pun.target;
}

// Approximate log2(x).
//
// Splits x into its unbiased exponent and a mantissa f in [0,1), evaluates a
// polynomial in f for the fractional part of the logarithm, and adds the
// exponent. Inputs below FLT_MIN (including zero and negatives) are treated as
// FLT_MIN; inputs above FLT_MAX are treated as FLT_MAX.
__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;

    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);

    // The exponent field sits above the 23 mantissa bits and is biased by 127.
    int exponent = int(bits >> 23) - 127;

    // Keep the mantissa, force the exponent to that of 1.0, subtract 1 -> [0,1).
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;

    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;

    // The polynomial is evaluated as two independent Horner chains (hi, lo)
    // that are combined at the end.
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);

    return ((f4 * hi) + (f * lo)) + exponent;
}

// Approximate 2^x.
//
// x is clamped to [-126, 126], split into an integer part m and a remainder,
// a 5th degree polynomial is evaluated on the remainder, and the result is
// scaled by 2^m by adding m directly into the exponent field of its bit
// pattern.
__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x >  126.0f) x =  126.0f;

    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)

    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);

    // multiply by 2 ^ m by adding in the exponent
    // (the exponent field starts at bit 23, just above the mantissa)
    // NOTE: left-shift of negative number is undefined behavior, hence the
    // conversion of m to unsigned before shifting
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

__forceinline float fast_safe_pow (float x, float y) {

    if (y == ) return 1.0f; // x^0=1

    if (x == ) return 0.0f; // 0^y=0

    // be cheap & exact for special case of squaring and identity

    if (y == 1.0f)

        return x;

    if (y == 2.0f)

        return min (x*x, FLT_MAX);

    float sign = 1.0f;

    if (x < ) {

        // if x is negative, only deal with integer powers

        // powf returns NaN for non-integers, we will return 0 instead

        int ybits = bit_cast<float, int>(y) & 0x7fffffff;

        if (ybits >= 0x4b800000) {

            // always even int, keep positive

        } else if (ybits >= 0x3f800000) {

            // bigger than 1, check

            int k = (ybits >> ) - ;  // get exponent

            int j =  ybits >> ( - k);   // shift out possible fractional bits

            if ((j << ( - k)) == ybits) // rebuild number and check for a match

                sign = bit_cast<int, float>(0x3f800000 | (j << )); // +1 for even, -1 for odd

            else

                return 0.0f; // not integer

        } else {

            return 0.0f; // not integer

        }

    }

    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));

}

/////////

// Largest absolute element-wise difference between two equally sized arrays.
static float max_abs_error(const std::vector<float> &lhs, const std::vector<float> &rhs)
{
    float worst = 0.0f;

    for (size_t i = 0; i < lhs.size(); ++i)
    {
        const float err = fabsf(lhs[i] - rhs[i]);

        if (err > worst)
            worst = err;
    }

    return worst;
}

// Benchmark driver: times myPow, fastpow, fast_safe_pow and powf raising N
// random values in [0, 1000] to the power 0.8, then prints the maximum
// absolute error of each approximation against powf.
//
// Usage: prog [N]   (N = number of samples, must be a positive integer)
int main(int argc, char *argv[])
{
    // NOTE(review): the original sample count was lost from this listing;
    // 10,000,000 is an assumed default - confirm against the intended value.
    int N = 10000000;

    if (argc > 1)
    {
        const long requested = strtol(argv[1], NULL, 10);

        if (requested <= 0 || requested > 0x7FFFFFFFL)
        {
            fprintf(stderr, "invalid sample count: %s\n", argv[1]);
            return 1;
        }

        N = (int)requested;
    }

    std::vector<float> buf(N);
    std::vector<float> a(N);   // myPow results
    std::vector<float> b(N);   // powf reference results
    std::vector<float> c(N);   // fastpow results
    std::vector<float> d(N);   // fast_safe_pow results

    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    // clock() returns clock_t; convert explicitly to long so the value always
    // matches the printf conversion.
    clock_t start_time = clock();

    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }

    printf("sum (fast) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();

    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }

    printf("sum (fast2) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();

    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }

    printf("sum (fast3) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();

    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }

    printf("sum in clock %ld\n", (long)(clock() - start_time));

    printf("Error is %f\n", max_abs_error(a, b));
    printf("Error2 is %f\n", max_abs_error(b, c));
    printf("Error3 is %f\n", max_abs_error(b, d));

    return 0;
}

fast powf的更多相关文章

opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较
opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较参考: http://wenku.baidu.com/link?url=1aDYAJBCrrK-uk2w3sSNai7h52x_ ...
基于Fast Bilateral Filtering 算法的 High-Dynamic Range(HDR) 图像显示技术。
一.引言本人初次接触HDR方面的知识,有描述不正确的地方烦请见谅. 为方便文章描述,引用部分百度中的文章对HDR图像进行简单的描述. 高动态范围图像(High-Dynamic Range,简称HDR ...
Fast RCNN 训练自己的数据集（3训练和检测）
转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ https://github.com/YihangLou/fas ...
Fast RCNN 训练自己数据集 (2修改数据读取接口)
Fast RCNN训练自己的数据集 (2修改读写接口) 转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ http ...
网格弹簧质点系统模拟（Spring-Mass System by Fast Method）附源码
弹簧质点模型的求解方法包括显式欧拉积分和隐式欧拉积分等方法,其中显式欧拉积分求解快速,但积分步长小,两个可视帧之间需要多次积分,而隐式欧拉积分则需要求解线性方程组,但其稳定性好,能够取较大的积分步长. ...
XiangBai——【AAAI2017】TextBoxes_A Fast Text Detector with a Single Deep Neural Network
XiangBai--[AAAI2017]TextBoxes:A Fast Text Detector with a Single Deep Neural Network 目录作者和相关链接方法概括 ...
论文笔记--Fast RCNN
很久之前试着写一篇深度学习的基础知识,无奈下笔之后发现这个话题确实太大,今天发一篇最近看的论文Fast RCNN.这篇文章是微软研究院的Ross Girshick大神的一篇作品,主要是对RCNN的一些 ...
[转]Amazon DynamoDB – a Fast and Scalable NoSQL Database Service Designed for Internet Scale Applications
This article is from blog of Amazon CTO Werner Vogels. -------------------- Today is a very exciting ...
FAST特征点检测features2D
#include <opencv2/core/core.hpp> #include <opencv2/features2d/features2d.hpp> #include & ...

随机推荐

什么场景应该用 MongoDB（转）
很多人比较关心 MongoDB 的适用场景,也有用户在话题里分享了自己的业务场景,比如: 案例1 用在应用服务器的日志记录,查找起来比文本灵活,导出也很方便.也是给应用练手,从外围系统开始使用Mong ...
cron，at的权限控制
/etc/cron.deny存在 /etc/cron.deny不存在 /etc/cron.allow存在只有/etc/cron.allow中列出的用户才能运行crontab -e:忽略/etc/ ...
PAT 1065 单身狗（25）（STL-map+思路+测试点分析）
1065 单身狗(25 分) "单身狗"是中文对于单身人士的一种爱称.本题请你从上万人的大型派对中找出落单的客人,以便给予特殊关爱. 输入格式: 输入第一行给出一个正整数 N(≤ ...
Maven系列（二）exec-maven-plugin
Maven系列(二)exec-maven-plugin 1. mvn 命令行运行 # exec:java 不会自动编译代码,你需要手动执行 mvn compile 来完成编译 mvn compile ...
iis 应用程序预热
<applicationPools> <add name="appname" managedRuntimeVersion="v4.0" sta ...
低配NOSQL
东西写的太简单了都不好意思说是NOSQL 其实就是STL 的map容器记录了写入的信息解析了下数据仅此. 分析的时候想了很多比如学习redis的自写hash,动态调整hash表容量. 比如右值或 ...
ftp sftp vsftp
ftp sftp (secure) 是文件传输协议 vsftp(very secure) 是 ftp 服务端 sftp 是 ssh 的一部分
java实现word,ppt,excel,jpg转pdf
word,excel,jpeg 转 pdf 首先下载相关jar包:http://download.csdn.net/detail/xu281828044/6922499 import java.io. ...
2018.09.22 牧场的安排（状压dp）
描述农民 John 购买了一处肥沃的矩形牧场,分成M*N(1 <= M <= 12; 1 <= N <= 12)个格子.他想在那里的一些格子中种植美味的玉米.遗憾的是,有些 ...
springboot问题，没有主清单属性
在pom.xml中添加 <build> <plugins> <plugin> <groupId>org.apache.maven.plugins< ...

fast powf

fast powf的更多相关文章

随机推荐

热门专题