测试结果:

sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389

#include <stdio.h>
#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
#include <math.h>
#include <time.h> /*
 * (c) Ian Stephenson
 *
 * ian@dctsystems.co.uk
 *
 * Fast pow() reference implementation
 */
/*
 * Original source:
 *   http://www.dctsystems.co.uk/Software/power.html
 *   http://www.dctsystems.co.uk/Software/power.c
 */
/*
 * 2^23 and its reciprocal. They scale between a float's raw bit pattern
 * (read as an integer) and a fixed-point "exponent.mantissa" value.
 * NOTE(review): the integer literals in this section (1 << 23, 127) were
 * missing from this copy of the code and have been restored to match the
 * power.c reference linked above -- confirm against that file.
 */
const float shift23 = (1 << 23);
const float OOshift23 = 1.0 / (1 << 23);

/*
 * Cheap floor() for values whose truncation fits in an int.
 *
 * Truncates toward zero, then subtracts one only when truncation rounded
 * up (a negative, non-integral input). Comparing against the truncated
 * value rather than against zero keeps negative whole numbers exact:
 * myFloorf(-2.0f) is -2.0f, not -3.0f.
 */
__forceinline float myFloorf(float a)
{
    const int truncated = (int)a;
    return (float)(truncated - (a < (float)truncated));
}

/*
 * Approximate log2(i) for positive, finite i.
 *
 * The bit pattern of i, divided by 2^23 and with 127 subtracted, is a
 * piecewise-linear estimate of log2(i). A parabolic term in the
 * fractional part, scaled by LogBodge, reduces the error between
 * powers of two.
 */
__forceinline float myLog2(float i)
{
    const float LogBodge = 0.346607f;
    /* Union punning matches the other routines in this file and avoids
       the pointer cast, which breaks strict-aliasing rules. */
    union { float f; int bits; } pun = { i };
    float x = (float)pun.bits;
    x *= OOshift23; //1/pow(2,23);
    x = x - 127;
    float y = x - myFloorf(x);
    y = (y - y * y) * LogBodge;
    return x + y;
}

/*
 * Approximate 2^i; the inverse construction of myLog2().
 *
 * Applies the parabolic correction (scaled by PowBodge), adds 127,
 * scales by 2^23 and reinterprets the resulting integer as a float.
 */
__forceinline float myPow2(float i)
{
    const float PowBodge = 0.33971f;
    float y = i - myFloorf(i);
    y = (y - y * y) * PowBodge;
    float x = i + 127 - y;
    x *= shift23; //pow(2,23);
    union { int bits; float f; } pun = { (int)x };
    return pun.f;
}

/*
 * Approximate a^b as 2^(b * log2(a)), for a > 0.
 */
__forceinline float myPow(float a, float b)
{
    return myPow2(b * myLog2(a));
}

///////////////////////////////////////
/* The code below is from http://code.google.com/p/fastapprox/ */
/*
 * Approximate 2^p.
 *
 * p is clamped below at -126. The float is assembled directly from its
 * bit pattern: (exponent + bias + rational correction in the fractional
 * part z) scaled by 2^23.
 * NOTE(review): the literals 0, -126 and (1 << 23) were missing from this
 * copy and are restored to match the fastapprox source -- confirm.
 */
__forceinline float fastpow2(float p)
{
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp;
    /* z is the fractional part of clipp, shifted into [0, 1] for
       negative inputs by `offset`. */
    float z = clipp - w + offset;
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

/*
 * Approximate log2(x) for positive, finite x.
 *
 * vx.i scaled by 2^-23 gives a coarse estimate; mx holds the mantissa of
 * x re-biased into [0.5, 1) and feeds the rational correction.
 */
__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f; /* 2^-23 */
    return y - 124.22551499f
             - 1.498030302f * mx.f
             - 1.72587999f / (0.3520887068f + mx.f);
}

/*
 * Approximate x^p as 2^(p * log2(x)), for x > 0.
 */
__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////
/* Guarded so that a definition already supplied by <float.h> wins. */
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
/* Float limits used for clamping below. Guarded so that definitions
   already supplied by <float.h> win. */
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif

/// Smaller of two values (returns b when they compare equal).
template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

/// Absolute value of a float, obtained by clearing its sign bit.
__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE: in the future we may want to explicitly ask for a fused
    // multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

/// Reinterpret the bits of `in` as OUT_TYPE through a union.
/// Both types are expected to have the same size.
template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
    cvt.in_val = in;
    return cvt.out_val;
}

/// Approximate log2(x). The input is clamped to [FLT_MIN, FLT_MAX].
/// NOTE(review): the shift count 23 and the bias 127 were missing from
/// this copy and are restored to match the OpenImageIO fmath.h source.
__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    // unbiased exponent of x
    int exponent = int(bits >> 23) - 127;
    // mantissa re-biased into [1, 2), then shifted to f in [0, 1)
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

/// Approximate 2^x. The input is clamped to [-126, 126].
__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x >  126.0f) x =  126.0f;
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

/// Approximate x^y with defined results for the awkward inputs:
///   - y == 0 returns 1; x == 0 returns 0
///   - y == 1 and y == 2 are computed exactly (x*x is capped at FLT_MAX)
///   - a negative x with an integral y returns the correctly signed result
///   - a negative x with a non-integral y returns 0 (powf would return NaN)
__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // |y| >= 2^24: always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j =  ybits >> (23 - k);   // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                // Move the low bit of j into the sign bit of 1.0f:
                // +1 for even, -1 for odd. The shift is done on an unsigned
                // value so the bit that reaches position 31 cannot overflow
                // a signed int.
                sign = bit_cast<unsigned, float>(0x3f800000u | (unsigned(j) << 31));
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}

/////////
/// Largest absolute element-wise difference between two float arrays
/// of `count` elements.
static float maxAbsError(const float *reference, const float *approx, int count)
{
    float worst = 0.0f;
    for (int i = 0; i < count; ++i)
    {
        const float err = fabsf(reference[i] - approx[i]);
        if (err > worst)
            worst = err;
    }
    return worst;
}

/// Benchmark driver: times x^0.8 over N random inputs in [0, 1000] with
/// myPow, fastpow, fast_safe_pow and powf, then prints each
/// approximation's maximum absolute error relative to powf.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // NOTE(review): the original element count was lost from this copy of
    // the code; 10,000,000 is a placeholder -- TODO confirm.
    const int N = 10000000;
    float *buf = new float[N];
    float *a = new float[N]; // myPow results
    float *b = new float[N]; // powf results (reference)
    float *c = new float[N]; // fastpow results
    float *d = new float[N]; // fast_safe_pow results

    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    // clock() returns clock_t; keep that type and cast explicitly when
    // printing so the format specifier matches on every platform.
    clock_t start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %ld\n", (long)(clock() - start_time));

    printf("Error is %f\n", maxAbsError(b, a, N));
    printf("Error2 is %f\n", maxAbsError(b, c, N));
    printf("Error3 is %f\n", maxAbsError(b, d, N));

    delete[] buf;
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    return 0;
}

fast powf的更多相关文章

  1. opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较

    opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较 参考: http://wenku.baidu.com/link?url=1aDYAJBCrrK-uk2w3sSNai7h52x_ ...

  2. 基于Fast Bilateral Filtering 算法的 High-Dynamic Range(HDR) 图像显示技术。

    一.引言 本人初次接触HDR方面的知识,有描述不正确的地方烦请见谅. 为方便文章描述,引用部分百度中的文章对HDR图像进行简单的描述. 高动态范围图像(High-Dynamic Range,简称HDR ...

  3. Fast RCNN 训练自己的数据集(3训练和检测)

    转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ https://github.com/YihangLou/fas ...

  4. Fast RCNN 训练自己数据集 (2修改数据读取接口)

    Fast RCNN训练自己的数据集 (2修改读写接口) 转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ http ...

  5. 网格弹簧质点系统模拟(Spring-Mass System by Fast Method)附源码

    弹簧质点模型的求解方法包括显式欧拉积分和隐式欧拉积分等方法,其中显式欧拉积分求解快速,但积分步长小,两个可视帧之间需要多次积分,而隐式欧拉积分则需要求解线性方程组,但其稳定性好,能够取较大的积分步长. ...

  6. XiangBai——【AAAI2017】TextBoxes_A Fast Text Detector with a Single Deep Neural Network

    XiangBai--[AAAI2017]TextBoxes:A Fast Text Detector with a Single Deep Neural Network 目录 作者和相关链接 方法概括 ...

  7. 论文笔记--Fast RCNN

    很久之前试着写一篇深度学习的基础知识,无奈下笔之后发现这个话题确实太大,今天发一篇最近看的论文Fast RCNN.这篇文章是微软研究院的Ross Girshick大神的一篇作品,主要是对RCNN的一些 ...

  8. [转]Amazon DynamoDB – a Fast and Scalable NoSQL Database Service Designed for Internet Scale Applications

    This article is from blog of Amazon CTO Werner Vogels. -------------------- Today is a very exciting ...

  9. FAST特征点检测features2D

    #include <opencv2/core/core.hpp> #include <opencv2/features2d/features2d.hpp> #include & ...

随机推荐

  1. 二叉树的锯齿形层次遍历 · Binary Tree Zigzag Level Order Traversal

    [抄题]: 给出一棵二叉树,返回其节点值的锯齿形层次遍历(先从左往右,下一层再从右往左,层与层之间交替进行) [思维问题]: 不知道反复切换要怎么做:用boolean normalOrder当作布尔型 ...

  2. OC 线程操作2 - NSThread

        方法1 :直接创建 alloc init - (void)createNSThread111{ /* 参数1: (nonnull id) 目标对象 self 参数2:(nonnull SEL) ...

  3. spring框架的概述与入门

    1. Spring框架的概述 * Spring是一个开源框架 * Spring是于2003 年兴起的一个轻量级的Java开发框架,由Rod Johnson在其著作Expert One-On-One J ...

  4. 别人家的PS系列又来了!!!

    又到了“别人的PS”系列的日常感叹了,大家请边看推文边组织语言准备留言,用点新鲜词,不要再说什么给跪了,献上膝盖之类的,争取换点词. 好了,废话不多说,开始正文,先看几则简单的PS作品: 这两组作品出 ...

  5. 任务取消TASK

    using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.T ...

  6. 2018.10.20 bzoj1079: [SCOI2008]着色方案(多维dp)

    传送门 dp妙题. f[a][b][c][d][e][last]f[a][b][c][d][e][last]f[a][b][c][d][e][last]表示还剩下aaa个可以用一次的,还剩下bbb个可 ...

  7. 如何使用 Visual C# 2005 或 Visual C# .NET 向 Excel 工作簿传输数据

    本文分步介绍了多种从 Microsoft Visual C# 2005 或 Microsoft Visual C# .NET 程序向 Microsoft Excel 2002 传输数据的方法.本文还提 ...

  8. python编码(二)

    谈谈Unicode编码,简要解释UCS.UTF.BMP.BOM等名词 问题一 使用Windows记事本的“另存为”,可以在GBK.Unicode.Unicode big endian和UTF-8这几种 ...

  9. 整数重复的第n位计算公式

    513不停的重复形成513513513....,求第n位是几的计算公式.

  10. Perf -- Linux下的系统性能调优工具,第 1 部分

    Perf 简介 Perf 是用来进行软件性能分析的工具. 通过它,应用程序可以利用 PMU,tracepoint 和内核中的特殊计数器来进行性能统计.它不但可以分析指定应用程序的性能问题 (per t ...