测试结果:

sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389

#include <stdio.h>
#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
#include <math.h>
#include <time.h> /*
* (c) Ian Stephenson
*
* ian@dctsystems.co.uk
*
* Fast pow() reference implementation
*/ /*
* http://www.dctsystems.co.uk/Software/power.html
* http://www.dctsystems.co.uk/Software/power.c
*/
// 2^23 and its reciprocal. An IEEE-754 single has 23 mantissa bits, so scaling
// by these moves a value between "float magnitude" and "raw bit pattern" space.
const float shift23 = (1 << 23);
const float OOshift23 = 1.0f / (1 << 23);

/// Floor of `a`, returned as a float.
///
/// Goes through an int conversion, so it is only meaningful while `a` fits in
/// an int.
__forceinline float myFloorf(float a)
{
    const int truncated = (int)a; // conversion rounds toward zero
    // Step down by one only when truncation actually rounded up, i.e. for
    // negative non-integers. Comparing against the truncated value (rather
    // than against 0) keeps exact negative integers such as -2.0f unchanged.
    return (float)(truncated - (a < (float)truncated));
}

/// Fast approximation of log2(i) for positive, normal `i`.
///
/// Reads the float's bit pattern as an integer, rescales it by 2^-23 and
/// removes the exponent bias (127), which yields a piecewise-linear log2.
/// A quadratic term on the fractional part (weight LogBodge) then reduces
/// the error. Assumes IEEE-754 single precision and a 32-bit int.
__forceinline float myLog2(float i)
{
    const float LogBodge = 0.346607f;
    union { float f; int bits; } pun = { i }; // same punning style as the rest of the file
    float x = (float)pun.bits;
    x *= OOshift23; // 1/pow(2,23)
    x = x - 127;    // remove the exponent bias
    float y = x - myFloorf(x);
    y = (y - y * y) * LogBodge;
    return x + y;
}

/// Fast approximation of 2^i; the inverse construction of myLog2.
///
/// Builds the result's bit pattern directly: (i + 127 - correction) * 2^23 is
/// converted to int and reinterpreted as a float. Assumes IEEE-754 single
/// precision and a 32-bit int.
__forceinline float myPow2(float i)
{
    const float PowBodge = 0.33971f;
    float y = i - myFloorf(i);
    y = (y - y * y) * PowBodge;
    float x = i + 127 - y; // re-apply the exponent bias
    x *= shift23;          // pow(2,23)
    union { int bits; float f; } pun;
    pun.bits = (int)x;
    return pun.f;
}

/// Fast approximation of a^b, computed as 2^(b * log2(a)).
__forceinline float myPow(float a, float b)
{
    return myPow2(b * myLog2(a));
}

///////////////////////////////////////
/* Code below is from http://code.google.com/p/fastapprox/ */
/// Fast approximation of 2^p (fastapprox).
///
/// Inputs below -126 are clamped to -126. The result's bit pattern is built
/// directly from a rational approximation scaled by 2^23 and reinterpreted as
/// a float through a union.
/// NOTE(review): there is no upper clamp; large `p` presumably overflows the
/// unsigned conversion — confirm callers stay in range.
__forceinline float fastpow2(float p)
{
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp; // integer part, truncated toward zero
    // Fractional part; `offset` shifts it into [0, 1] for negative inputs.
    float z = clipp - w + offset;
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

/// Fast approximation of log2(x) for positive `x` (fastapprox).
///
/// Combines the raw bit pattern scaled by 2^-23 (1.1920928955078125e-7) with
/// a rational correction evaluated on the mantissa remapped into [0.5, 1).
__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    // Keep the mantissa bits and force the exponent so that mx.f is in [0.5, 1).
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f;
    return y - 124.22551499f
        - 1.498030302f * mx.f
        - 1.72587999f / (0.3520887068f + mx.f);
}

/// Fast approximation of x^p, computed as 2^(p * log2(x)).
__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////
// Guarded: <float.h> (or another definition in this file) may already provide it.
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
// Float limits used for clamping below. Guarded because <float.h> (or an
// earlier definition in this file) may already provide them.
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif

/// Smaller of two values; returns `b` when the two compare equal.
template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

/// Absolute value of `x`, obtained by clearing the sign bit of its bit pattern.
__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE: in the future we may want to explicitly ask for a fused
    //       multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    //       explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

/// Reinterpret the bytes of `in` as OUT_TYPE by writing one union member and
/// reading the other. Every use in this file pairs float with int/unsigned;
/// IN_TYPE and OUT_TYPE are expected to have the same size.
template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
    cvt.in_val = in;
    return cvt.out_val;
}

/// Fast approximation of log2(x).
///
/// `x` is clamped to [FLT_MIN, FLT_MAX] first. The unbiased exponent is read
/// from the bit pattern and a polynomial in f = mantissa - 1 supplies the
/// fractional part of the logarithm.
__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    int exponent = int(bits >> 23) - 127; // drop the 23 mantissa bits, remove the bias
    // Rebuild a float in [1, 2) from the mantissa bits, then shift it to [0, 1).
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    // The polynomial is evaluated as two independent chains (hi and lo)
    // that are combined in the return expression.
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

/// Fast approximation of 2^x.
///
/// `x` is clamped to [-126, 126], split into an integer part m and a
/// remainder, a 5th-degree polynomial is evaluated on the remainder, and m is
/// then added straight into the exponent field of the result.
__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x >  126.0f) x =  126.0f;
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

/// Fast approximation of x^y that never returns NaN for the handled cases.
///
/// Special cases, in order: y == 0 gives 1; x == 0 gives 0; y == 1 gives x;
/// y == 2 gives x*x capped at FLT_MAX. For negative `x` only integer `y` is
/// supported (the sign follows the parity of y); a non-integer `y` yields 0
/// where powf would return NaN.
__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j =  ybits >> (23 - k);   // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                // +1 for even, -1 for odd. The shift is done in unsigned
                // because moving a set bit into the sign position of a signed
                // int is undefined behavior.
                sign = bit_cast<unsigned, float>(0x3f800000u | (unsigned(j) << 31));
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}

/////////
/// Largest absolute element-wise difference between two float arrays of
/// length `n`; returns 0 when `n` is not positive.
static float max_abs_error(const float *lhs, const float *rhs, int n)
{
    float max_err = 0.0f;
    for (int i = 0; i < n; ++i)
    {
        float err = fabsf(lhs[i] - rhs[i]);
        if (err > max_err)
            max_err = err;
    }
    return max_err;
}

/// Benchmark driver: times myPow, fastpow, fast_safe_pow and powf raising N
/// random values in [0, 1000] to the power 0.8, then prints each
/// approximation's maximum absolute error against powf.
int main(int argc, char *argv[])
{
    // NOTE(review): the original element count was lost when this listing was
    // extracted; 10M is a placeholder. Five float arrays of this size are
    // allocated, so adjust to the available memory.
    const int N = 10000000;
    float *buf = new float[N];
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d = new float[N];
    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    // clock() returns clock_t; cast the elapsed ticks to long so the printf
    // format matches the argument type on every platform.
    clock_t start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %ld\n", (long)(clock() - start_time));

    // Reference result from the C library.
    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %ld\n", (long)(clock() - start_time));

    printf("Error is %f\n", max_abs_error(a, b, N));
    printf("Error2 is %f\n", max_abs_error(b, c, N));
    printf("Error3 is %f\n", max_abs_error(b, d, N));

    delete[] buf;
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    return 0;
}

fast powf的更多相关文章

  1. opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较

    opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较 参考: http://wenku.baidu.com/link?url=1aDYAJBCrrK-uk2w3sSNai7h52x_ ...

  2. 基于Fast Bilateral Filtering 算法的 High-Dynamic Range(HDR) 图像显示技术。

    一.引言 本人初次接触HDR方面的知识,有描述不正确的地方烦请见谅. 为方便文章描述,引用部分百度中的文章对HDR图像进行简单的描述. 高动态范围图像(High-Dynamic Range,简称HDR ...

  3. Fast RCNN 训练自己的数据集(3训练和检测)

    转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ https://github.com/YihangLou/fas ...

  4. Fast RCNN 训练自己数据集 (2修改数据读取接口)

    Fast RCNN训练自己的数据集 (2修改读写接口) 转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ http ...

  5. 网格弹簧质点系统模拟(Spring-Mass System by Fast Method)附源码

    弹簧质点模型的求解方法包括显式欧拉积分和隐式欧拉积分等方法,其中显式欧拉积分求解快速,但积分步长小,两个可视帧之间需要多次积分,而隐式欧拉积分则需要求解线性方程组,但其稳定性好,能够取较大的积分步长. ...

  6. XiangBai——【AAAI2017】TextBoxes_A Fast Text Detector with a Single Deep Neural Network

    XiangBai--[AAAI2017]TextBoxes:A Fast Text Detector with a Single Deep Neural Network 目录 作者和相关链接 方法概括 ...

  7. 论文笔记--Fast RCNN

    很久之前试着写一篇深度学习的基础知识,无奈下笔之后发现这个话题确实太大,今天发一篇最近看的论文Fast RCNN.这篇文章是微软研究院的Ross Girshick大神的一篇作品,主要是对RCNN的一些 ...

  8. [转]Amazon DynamoDB – a Fast and Scalable NoSQL Database Service Designed for Internet Scale Applications

    This article is from blog of Amazon CTO Werner Vogels. -------------------- Today is a very exciting ...

  9. FAST特征点检测features2D

    #include <opencv2/core/core.hpp> #include <opencv2/features2d/features2d.hpp> #include & ...

随机推荐

  1. 如何在eclipse中配置maven以及eclipse访问本地仓库

    1.m2e的插件 因为使用eclipse版本比较高,所以它自带了maven的插件,但是我们希望可以使用我们自己指定的maven.配置步骤如下: eclipse--->preferences下,点 ...

  2. .NET中的Request

    获得浏览器中的URL 例:http://121.41.30.93:8010/ch/spell.aspx?id=58 Request.Url.PathAndQuery:/ch/spell.aspx?id ...

  3. beego启动找不到conf的原因

    beego配置文件路径如下: app.conf内容 httpaddr = "192.168.199.178" httpport = appname = SecProxy runmo ...

  4. Laravel Nginx 站点配置文件(Homestead)

    server {     listen 80;     listen 443 ssl http2;     server_name fmtmis.local;     root "/home ...

  5. asio 广播代码示例

    代码网络收集 修改了一个编译的小问题 客户端 // Client.cpp : 定义控制台应用程序的入口点. // #include "stdafx.h" #include < ...

  6. 使用Application Center Test (ACT)来做压力测试 【转】

    在我们完成了基于SPS2003的开发,实现了我们的具体应用以后,我们是不是就可以直接请用户来使用了呢?如果我这么做,那么有经验的开发人员一定会对此嗤之以鼻:居然连压力测试也不做!真是不想活了…… 呵呵 ...

  7. Python鸭子类型思想

    动态语言中经常提到鸭子类型,所谓鸭子类型就是:如果走起路来像鸭子,叫起来也像鸭子,那么它就是鸭子(If it walks like a duck and quacks like a duck, it ...

  8. 自然语言处理--中文文本向量化counterVectorizer()

    1.载入文档 #!/usr/bin/python # -*- coding: utf-8 -*- import pandas as pd import re import jieba from skl ...

  9. 《沉静领导》读书笔记zz

    就 像作者说的,这本书“只是一篇简单的随笔,它描绘并阐明了一种关于领导之道的思考方式,并且为把这种思考方式应用到实际行动中提供了指南.”但是,仔细想 来,倒有一点不同见解,或许,它描述的不可以叫做“领 ...

  10. 2018.10.23 hdu4745Two Rabbits(区间dp)

    传送门 区间dp经典题目. 首先断环为链. 然后题目相当于就是在找最大的回文子序列. 注意两个位置重合的时候相当于范围是n,不重合时范围是n-1. 代码: #include<bits/stdc+ ...