【源码阅读】opencv中opencl版本的dft函数的实现细节

1.函数声明

opencv-3.4.3\modules\core\include\opencv2\core.hpp:2157

// Public declaration of cv::dft(); both `flags` and `nonzeroRows` default to 0.
CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);

2.函数实现

opencv-3.4.3\modules\core\src\dxt.cpp:3315

// Entry point of cv::dft(): offers the work to the OpenCL back-ends first (when they are
// compiled in), and otherwise validates the input, allocates the destination and runs the
// transform through hal::DFT2D.
//
//   _src0        - input array; on the CPU path it must be CV_32FC1, CV_32FC2, CV_64FC1
//                  or CV_64FC2 (asserted below).
//   _dst         - output array; (re)created here with the same size as the input.
//   flags        - bit mask of DFT_* flags; this function inspects DFT_INVERSE,
//                  DFT_COMPLEX_INPUT, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT, DFT_ROWS and DFT_SCALE.
//   nonzero_rows - forwarded unchanged to ocl_dft() and to hal::DFT2D::create().
void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )

{

    CV_INSTRUMENT_REGION()

    // clAmdFft back-end: only tried when the library is available, the default device is not
    // a CPU, the destination is a UMat, the input has at most 2 dims and nonzero_rows is 0.
    // NOTE(review): CV_OCL_RUN presumably returns from cv::dft() when the condition holds and
    // the call succeeds - confirm against the macro definition, which is not shown here.
#ifdef HAVE_CLAMDFFT

    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&

            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,

               ocl_dft_amdfft(_src0, _dst, flags))

#endif

    // Built-in OpenCL back-end: tried when the destination is a UMat and the input has at most 2 dims.
#ifdef HAVE_OPENCL

    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,

               ocl_dft(_src0, _dst, flags, nonzero_rows))

#endif

    // ---- CPU path ----
    Mat src0 = _src0.getMat(), src = src0;

    bool inv = (flags & DFT_INVERSE) != 0;

    int type = src.type();

    int depth = src.depth();

    // Only 1- or 2-channel float/double matrices are accepted.
    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.

    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );

    // Choose the destination type:
    //   forward, 1-channel input, DFT_COMPLEX_OUTPUT -> 2-channel output
    //   inverse, 2-channel input, DFT_REAL_OUTPUT    -> 1-channel output
    //   anything else                                -> same type as the input
    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )

        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );

    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )

        _dst.create( src.size(), depth );

    else

        _dst.create( src.size(), type );

    Mat dst = _dst.getMat();

    // Translate the public DFT_* flags and the buffer properties into CV_HAL_DFT_* flags.
    int f = 0;

    if (src.isContinuous() && dst.isContinuous())

        f |= CV_HAL_DFT_IS_CONTINUOUS;

    if (inv)

        f |= CV_HAL_DFT_INVERSE;

    if (flags & DFT_ROWS)

        f |= CV_HAL_DFT_ROWS;

    if (flags & DFT_SCALE)

        f |= CV_HAL_DFT_SCALE;

    // Source and destination share the same data pointer: flag the transform as in-place.
    if (src.data == dst.data)

        f |= CV_HAL_DFT_IS_INPLACE;

    // Build the HAL DFT object for this geometry/flag combination and run it on the raw buffers.
    Ptr<hal::DFT2D> c = hal::DFT2D::create(src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);

    c->apply(src.data, src.step, dst.data, dst.step);

}

3. opencl的调用

#ifdef HAVE_OPENCL

    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,

               ocl_dft(_src0, _dst, flags, nonzero_rows))

#endif

ocl的函数实现：

opencv-3.4.3\modules\core\src\dxt.cpp:2161

// OpenCL implementation of the DFT, composed of a row pass and (for 2-D transforms) a
// column pass.
//
//   _src         - input array; only 1- or 2-channel CV_32F is accepted, or CV_64F when the
//                  default device reports double support.
//   _dst         - output array; created here as 1- or 2-channel depending on the FFT type.
//   flags        - bit mask of DFT_* flags (DFT_INVERSE, DFT_ROWS, DFT_REAL_OUTPUT,
//                  DFT_COMPLEX_OUTPUT are inspected here; the mask is also forwarded to the passes).
//   nonzero_rows - number of leading rows to process; values <= 0 or larger than the row
//                  count are replaced by the full row count.
//
// Returns false when the request is rejected by the checks below, otherwise the combined
// result of the enqueued row/column passes.
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)

{

    int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);

    Size ssize = _src.size();

    // True when the default OpenCL device reports a non-zero double-precision FP config.
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    // Reject: channel counts other than 1 or 2; depths other than CV_32F (or CV_64F with
    // device double support); and the contradictory request for both real and complex output.
    if (!(cn == 1 || cn == 2)

        || !(depth == CV_32F || (depth == CV_64F && doubleSupport))

        || ((flags & DFT_REAL_OUTPUT) && (flags & DFT_COMPLEX_OUTPUT)))

        return false;

    // Reject sizes whose total element count is not already an "optimal" DFT size, i.e.
    // (per the original comment) not a product of the primes { 2, 3, 5 }.

    if (ssize.area() != getOptimalDFTSize(ssize.area()))

        return false;

    UMat src = _src.getUMat();

    bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;

    // Out-of-range nonzero_rows means "process every row".
    if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )

        nonzero_rows = _src.rows();

    // Only the row pass is needed when row-wise transforms were requested or when a single
    // row is to be processed.
    bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;

    // Classify the transform from (real input, complex input, real output requested,
    // complex output requested, inverse).
    FftType fftType = determineFFTType(cn == 1, cn == 2,

        (flags & DFT_REAL_OUTPUT) != 0, (flags & DFT_COMPLEX_OUTPUT) != 0, inv);

    // `output` is the buffer written by the first pass: either the destination itself or,
    // for 2-D transforms with real output, a separate 2-channel intermediate buffer.
    UMat output;

    if (fftType == C2C || fftType == R2C)

    {

        // complex output

        _dst.create(src.size(), CV_MAKETYPE(depth, 2));

        output = _dst.getUMat();

    }

    else

    {

        // real output

        if (is1d)

        {

            // Single pass: write straight into the 1-channel destination.
            _dst.create(src.size(), CV_MAKETYPE(depth, 1));

            output = _dst.getUMat();

        }

        else

        {

            // Two passes: 1-channel destination plus a 2-channel intermediate buffer.
            _dst.create(src.size(), CV_MAKETYPE(depth, 1));

            output.create(src.size(), CV_MAKETYPE(depth, 2));

        }

    }

    bool result = false;

    if (!inv)

    {

        // Forward transform: rows first (src -> output), then columns (output -> _dst).
        // For R2R only output.cols/2 + 1 columns are passed to the column pass.
        int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;

        result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);

        if (!is1d)

            result = result && ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType);

    }

    else

    {

        if (fftType == C2C)

        {

            // complex output

            // Inverse complex-to-complex: rows (src -> output), then columns in place on output.
            result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);

            if (!is1d)

                result = result && ocl_dft_cols(output, output, output.cols, flags, fftType);

        }

        else

        {

            if (is1d)

            {

                // Inverse with non-C2C type, row-wise only: a single row pass.
                result = ocl_dft_rows(src, output, nonzero_rows, flags, fftType);

            }

            else

            {

                // Inverse with non-C2C type, 2-D: the pass order is reversed - columns first
                // (src -> intermediate output, using src.cols/2 + 1 columns), then rows
                // (intermediate output -> _dst).
                int nonzero_cols = src.cols/2 + 1;

                result = ocl_dft_cols(src, output, nonzero_cols, flags, fftType);

                result = result && ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType);

            }

        }

    }

    return result;

}

4.ocl_dft()里面的row/col的调用函数

函数原型：

static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)

static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)

看其中一个的源码：

// Row pass of the OpenCL DFT.
// Looks up (or lazily creates) the cached FFT plan for the source width and element depth,
// then enqueues the transform with the `rows` argument set to true.
// Returns whatever OCL_FftPlan::enqueueTransform() returns.
static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
{
    const int depth = CV_MAT_DEPTH(_src.type());

    // The plan is keyed on the transform length (number of columns) and the depth.
    Ptr<OCL_FftPlan> rowPlan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), depth);

    const bool transformRows = true;
    return rowPlan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, transformRows);
}

5.fft计算的对象池

每个确定尺寸的fft计算之前，需要建立一系列的初始化数据；如果每次计算相同尺寸都建立这些初始化数据，明显很浪费。

于是建立一个对象池，每出现一个fft计算的新尺寸，就缓存一个对象。空间换时间（但是长期运行场景要注意内存消耗）。

    // Returns the FFT plan for the given transform length and element depth, creating and
    // storing a new plan in planStorage the first time a (dft_size, depth) pair is requested.
    Ptr<OCL_FftPlan> OCL_FftPlanCache::getFftPlan(int dft_size, int depth)
    {
        // Pack both parameters into one int: dft_size in the upper bits, depth in the low 16.
        // NOTE(review): `dft_size << 16` overflows int for dft_size >= 32768, and for
        // dft_size >= 65536 distinct sizes map to the same key - confirm such sizes cannot
        // reach this function.
        const int cacheKey = (dft_size << 16) | (depth & 0xFFFF);

        std::map<int, Ptr<OCL_FftPlan> >::iterator hit = planStorage.find(cacheKey);
        if (hit != planStorage.end())
            return hit->second;

        // Cache miss: build the plan once and remember it for later calls.
        Ptr<OCL_FftPlan> freshPlan(new OCL_FftPlan(dft_size, depth));
        planStorage[cacheKey] = freshPlan;
        return freshPlan;
    }

6. fft对象

opencv-3.4.3\modules\core\src\dxt.cpp:1881

struct OCL_FftPlan

初始化在构造函数：OCL_FftPlan(int _size, int _depth)

计算使用这个方法： bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const

方法的主要代码是构造核函数的编译参数。

6.1 opencl核函数的编译、绑定参数、执行

enqueueTransform()方法的核心代码如下：

        ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);

        if (k.empty())

            return false;

        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::ReadOnlyNoSize(twiddles), thread_count, num_dfts);

        return k.run(2, globalsize, localsize, false);

ocl::Kernel 对象用于编译opencl的核函数。

ocl::KernelArg 用于绑定核函数的执行参数。

k.run() 执行核函数。

6.2 核函数的定义

ocl::core::fft_oclsrc 这个常量对象保存了核函数的源码，但是在源码树的所有 .h、.hpp、.cpp 文件中都搜索不到它的定义。

原因是这部分代码并不在源码树里，而是在编译（构建）过程中自动生成的。

定义在：

opencv-3.4.3/build/modules/core/opencl_kernels_core.hpp:21

extern struct cv::ocl::internal::ProgramEntry fft_oclsrc;

实现在：

opencv-3.4.3/build/modules/core/opencl_kernels_core.cpp:770

struct cv::ocl::internal::ProgramEntry fft_oclsrc={moduleName, "fft",

"#define SQRT_2 0.707106781188f\n"

看来只是用一个脚本，把opencl的核函数代码转换成为C++字符串而已。

6.3 核函数的定义文件

最终找到opencl fft的核函数的文件：

opencv-3.4.3\modules\core\src\opencl\fft.cl

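这里乍看有一个问题：enqueueTransform() 每次调用都会重新构造一个 ocl::Kernel 对象，在 dxt.cpp 里也看不到缓存编译结果的代码。不过需要注意，ocl::Kernel 是通过 ocl::Context::getProg() 取得已编译程序的，后者会按“源码 + 编译选项”缓存 ocl::Program，所以真正的 OpenCL 编译通常只在第一次调用时发生；之后每次调用重复的是创建 kernel 对象、绑定参数的开销。

7.cv::dft()可能的优化点

每次调用核函数都会重新创建 ocl::Kernel 对象（编译结果虽已被 Context 缓存），可以考虑在 OCL_FftPlan 中缓存 ocl::Kernel 对象，省掉这部分重复开销。
把C函数的风格修改为面向对象风格，把UMat数据upload、核函数运行、UMat数据download等部分都加入异步队列，使得连续计算多个dft()的时候，可以避免CPU等待GPU的结果。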