OpenCV 源码分析:CUDA 可分离滤波器设计(发现 OpenCV 的 CUDA 实现真的很慢)
1. 主函数
/**
 * Runs the separable linear filter on `_src` and stores the result in `_dst`.
 *
 * The filter is applied in two passes: rowFilter_ reads `src` and writes the
 * intermediate buffer `buf_`, then columnFilter_ reads `buf_` and writes `dst`.
 * Both passes receive the same cudaStream_t taken from `_stream`.
 *
 * @param _src    Input image; its type must equal srcType_ (enforced by CV_Assert).
 * @param _dst    Output image; (re)created with the size of `src` and type dstType_.
 * @param _stream Stream wrapper whose underlying cudaStream_t is passed to both passes.
 */
void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
{
    GpuMat src = _src.getGpuMat();
    CV_Assert( src.type() == srcType_ );

    _dst.create(src.size(), dstType_);
    GpuMat dst = _dst.getGpuMat();

    // Intermediate buffer that holds the output of the row pass.
    ensureSizeIsEnough(src.size(), bufType_, buf_);

    // Compute capability packed as major * 10 + minor (e.g. 3.5 -> 35).
    // It is forwarded to the row/column callers, which branch on it to
    // choose their launch configuration.
    DeviceInfo devInfo;
    const int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();

    cudaStream_t stream = StreamAccessor::getStream(_stream);

    rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
    columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
}
The column filter launches 16x16 thread blocks and the row filter launches 32x8 thread blocks (on devices with compute capability 2.0 or higher).
2. COL
namespace filter
{
    /**
     * Host entry point of the column (vertical) pass.
     *
     * Looks up the caller instantiated for the requested border mode and
     * kernel length in a static table and invokes it.
     *
     * @param src      Source image (byte view; cast to PtrStepSz<T> before the call).
     * @param dst      Destination image (byte view; cast to PtrStepSz<D> before the call).
     * @param kernel   Filter coefficients, forwarded unchanged to the caller.
     * @param ksize    Number of coefficients; used directly as the second table index,
     *                 so it must be in [1, 32]. Index 0 holds a null pointer.
     * @param anchor   Anchor position inside the kernel, forwarded to the caller.
     * @param brd_type First table index; rows are ordered Constant, Replicate,
     *                 Reflect, Wrap, Reflect101, so it must be in [0, 4].
     * @param cc       Compute capability value forwarded to the caller.
     * @param stream   CUDA stream forwarded to the caller.
     *
     * Neither index is range-checked here.
     */
    template <typename T, typename D>
    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
    {
        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);

        // callers[border mode][kernel length]; slot 0 of every row is unused.
        static const caller_t callers[5][33] =
        {
            {
                0,
                column_filter::caller< 1, T, D, BrdColConstant>,
                column_filter::caller< 2, T, D, BrdColConstant>,
                column_filter::caller< 3, T, D, BrdColConstant>,
                column_filter::caller< 4, T, D, BrdColConstant>,
                column_filter::caller< 5, T, D, BrdColConstant>,
                column_filter::caller< 6, T, D, BrdColConstant>,
                column_filter::caller< 7, T, D, BrdColConstant>,
                column_filter::caller< 8, T, D, BrdColConstant>,
                column_filter::caller< 9, T, D, BrdColConstant>,
                column_filter::caller<10, T, D, BrdColConstant>,
                column_filter::caller<11, T, D, BrdColConstant>,
                column_filter::caller<12, T, D, BrdColConstant>,
                column_filter::caller<13, T, D, BrdColConstant>,
                column_filter::caller<14, T, D, BrdColConstant>,
                column_filter::caller<15, T, D, BrdColConstant>,
                column_filter::caller<16, T, D, BrdColConstant>,
                column_filter::caller<17, T, D, BrdColConstant>,
                column_filter::caller<18, T, D, BrdColConstant>,
                column_filter::caller<19, T, D, BrdColConstant>,
                column_filter::caller<20, T, D, BrdColConstant>,
                column_filter::caller<21, T, D, BrdColConstant>,
                column_filter::caller<22, T, D, BrdColConstant>,
                column_filter::caller<23, T, D, BrdColConstant>,
                column_filter::caller<24, T, D, BrdColConstant>,
                column_filter::caller<25, T, D, BrdColConstant>,
                column_filter::caller<26, T, D, BrdColConstant>,
                column_filter::caller<27, T, D, BrdColConstant>,
                column_filter::caller<28, T, D, BrdColConstant>,
                column_filter::caller<29, T, D, BrdColConstant>,
                column_filter::caller<30, T, D, BrdColConstant>,
                column_filter::caller<31, T, D, BrdColConstant>,
                column_filter::caller<32, T, D, BrdColConstant>
            },
            {
                0,
                column_filter::caller< 1, T, D, BrdColReplicate>,
                column_filter::caller< 2, T, D, BrdColReplicate>,
                column_filter::caller< 3, T, D, BrdColReplicate>,
                column_filter::caller< 4, T, D, BrdColReplicate>,
                column_filter::caller< 5, T, D, BrdColReplicate>,
                column_filter::caller< 6, T, D, BrdColReplicate>,
                column_filter::caller< 7, T, D, BrdColReplicate>,
                column_filter::caller< 8, T, D, BrdColReplicate>,
                column_filter::caller< 9, T, D, BrdColReplicate>,
                column_filter::caller<10, T, D, BrdColReplicate>,
                column_filter::caller<11, T, D, BrdColReplicate>,
                column_filter::caller<12, T, D, BrdColReplicate>,
                column_filter::caller<13, T, D, BrdColReplicate>,
                column_filter::caller<14, T, D, BrdColReplicate>,
                column_filter::caller<15, T, D, BrdColReplicate>,
                column_filter::caller<16, T, D, BrdColReplicate>,
                column_filter::caller<17, T, D, BrdColReplicate>,
                column_filter::caller<18, T, D, BrdColReplicate>,
                column_filter::caller<19, T, D, BrdColReplicate>,
                column_filter::caller<20, T, D, BrdColReplicate>,
                column_filter::caller<21, T, D, BrdColReplicate>,
                column_filter::caller<22, T, D, BrdColReplicate>,
                column_filter::caller<23, T, D, BrdColReplicate>,
                column_filter::caller<24, T, D, BrdColReplicate>,
                column_filter::caller<25, T, D, BrdColReplicate>,
                column_filter::caller<26, T, D, BrdColReplicate>,
                column_filter::caller<27, T, D, BrdColReplicate>,
                column_filter::caller<28, T, D, BrdColReplicate>,
                column_filter::caller<29, T, D, BrdColReplicate>,
                column_filter::caller<30, T, D, BrdColReplicate>,
                column_filter::caller<31, T, D, BrdColReplicate>,
                column_filter::caller<32, T, D, BrdColReplicate>
            },
            {
                0,
                column_filter::caller< 1, T, D, BrdColReflect>,
                column_filter::caller< 2, T, D, BrdColReflect>,
                column_filter::caller< 3, T, D, BrdColReflect>,
                column_filter::caller< 4, T, D, BrdColReflect>,
                column_filter::caller< 5, T, D, BrdColReflect>,
                column_filter::caller< 6, T, D, BrdColReflect>,
                column_filter::caller< 7, T, D, BrdColReflect>,
                column_filter::caller< 8, T, D, BrdColReflect>,
                column_filter::caller< 9, T, D, BrdColReflect>,
                column_filter::caller<10, T, D, BrdColReflect>,
                column_filter::caller<11, T, D, BrdColReflect>,
                column_filter::caller<12, T, D, BrdColReflect>,
                column_filter::caller<13, T, D, BrdColReflect>,
                column_filter::caller<14, T, D, BrdColReflect>,
                column_filter::caller<15, T, D, BrdColReflect>,
                column_filter::caller<16, T, D, BrdColReflect>,
                column_filter::caller<17, T, D, BrdColReflect>,
                column_filter::caller<18, T, D, BrdColReflect>,
                column_filter::caller<19, T, D, BrdColReflect>,
                column_filter::caller<20, T, D, BrdColReflect>,
                column_filter::caller<21, T, D, BrdColReflect>,
                column_filter::caller<22, T, D, BrdColReflect>,
                column_filter::caller<23, T, D, BrdColReflect>,
                column_filter::caller<24, T, D, BrdColReflect>,
                column_filter::caller<25, T, D, BrdColReflect>,
                column_filter::caller<26, T, D, BrdColReflect>,
                column_filter::caller<27, T, D, BrdColReflect>,
                column_filter::caller<28, T, D, BrdColReflect>,
                column_filter::caller<29, T, D, BrdColReflect>,
                column_filter::caller<30, T, D, BrdColReflect>,
                column_filter::caller<31, T, D, BrdColReflect>,
                column_filter::caller<32, T, D, BrdColReflect>
            },
            {
                0,
                column_filter::caller< 1, T, D, BrdColWrap>,
                column_filter::caller< 2, T, D, BrdColWrap>,
                column_filter::caller< 3, T, D, BrdColWrap>,
                column_filter::caller< 4, T, D, BrdColWrap>,
                column_filter::caller< 5, T, D, BrdColWrap>,
                column_filter::caller< 6, T, D, BrdColWrap>,
                column_filter::caller< 7, T, D, BrdColWrap>,
                column_filter::caller< 8, T, D, BrdColWrap>,
                column_filter::caller< 9, T, D, BrdColWrap>,
                column_filter::caller<10, T, D, BrdColWrap>,
                column_filter::caller<11, T, D, BrdColWrap>,
                column_filter::caller<12, T, D, BrdColWrap>,
                column_filter::caller<13, T, D, BrdColWrap>,
                column_filter::caller<14, T, D, BrdColWrap>,
                column_filter::caller<15, T, D, BrdColWrap>,
                column_filter::caller<16, T, D, BrdColWrap>,
                column_filter::caller<17, T, D, BrdColWrap>,
                column_filter::caller<18, T, D, BrdColWrap>,
                column_filter::caller<19, T, D, BrdColWrap>,
                column_filter::caller<20, T, D, BrdColWrap>,
                column_filter::caller<21, T, D, BrdColWrap>,
                column_filter::caller<22, T, D, BrdColWrap>,
                column_filter::caller<23, T, D, BrdColWrap>,
                column_filter::caller<24, T, D, BrdColWrap>,
                column_filter::caller<25, T, D, BrdColWrap>,
                column_filter::caller<26, T, D, BrdColWrap>,
                column_filter::caller<27, T, D, BrdColWrap>,
                column_filter::caller<28, T, D, BrdColWrap>,
                column_filter::caller<29, T, D, BrdColWrap>,
                column_filter::caller<30, T, D, BrdColWrap>,
                column_filter::caller<31, T, D, BrdColWrap>,
                column_filter::caller<32, T, D, BrdColWrap>
            },
            {
                0,
                column_filter::caller< 1, T, D, BrdColReflect101>,
                column_filter::caller< 2, T, D, BrdColReflect101>,
                column_filter::caller< 3, T, D, BrdColReflect101>,
                column_filter::caller< 4, T, D, BrdColReflect101>,
                column_filter::caller< 5, T, D, BrdColReflect101>,
                column_filter::caller< 6, T, D, BrdColReflect101>,
                column_filter::caller< 7, T, D, BrdColReflect101>,
                column_filter::caller< 8, T, D, BrdColReflect101>,
                column_filter::caller< 9, T, D, BrdColReflect101>,
                column_filter::caller<10, T, D, BrdColReflect101>,
                column_filter::caller<11, T, D, BrdColReflect101>,
                column_filter::caller<12, T, D, BrdColReflect101>,
                column_filter::caller<13, T, D, BrdColReflect101>,
                column_filter::caller<14, T, D, BrdColReflect101>,
                column_filter::caller<15, T, D, BrdColReflect101>,
                column_filter::caller<16, T, D, BrdColReflect101>,
                column_filter::caller<17, T, D, BrdColReflect101>,
                column_filter::caller<18, T, D, BrdColReflect101>,
                column_filter::caller<19, T, D, BrdColReflect101>,
                column_filter::caller<20, T, D, BrdColReflect101>,
                column_filter::caller<21, T, D, BrdColReflect101>,
                column_filter::caller<22, T, D, BrdColReflect101>,
                column_filter::caller<23, T, D, BrdColReflect101>,
                column_filter::caller<24, T, D, BrdColReflect101>,
                column_filter::caller<25, T, D, BrdColReflect101>,
                column_filter::caller<26, T, D, BrdColReflect101>,
                column_filter::caller<27, T, D, BrdColReflect101>,
                column_filter::caller<28, T, D, BrdColReflect101>,
                column_filter::caller<29, T, D, BrdColReflect101>,
                column_filter::caller<30, T, D, BrdColReflect101>,
                column_filter::caller<31, T, D, BrdColReflect101>,
                column_filter::caller<32, T, D, BrdColReflect101>
            }
        };

        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
    }
}
/**
 * Launches linearColumnFilter for one kernel length / border mode combination.
 * The dispatch table in filter::linearColumn refers to this function as
 * column_filter::caller.
 *
 * @tparam KSIZE Kernel length, forwarded to the kernel as a template argument.
 * @tparam T     Source element type.
 * @tparam D     Destination element type.
 * @tparam B     Border-handling class template; instantiated as B<T>(src.rows).
 *
 * @param src    Source image.
 * @param dst    Destination image.
 * @param kernel Filter coefficients passed straight to the kernel.
 * @param anchor Anchor position inside the kernel.
 * @param cc     Compute capability as major * 10 + minor; selects the block shape.
 * @param stream Stream to launch on. When it is 0 the call synchronises the
 *               device before returning.
 */
template <int KSIZE, typename T, typename D, template<typename> class B>
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
{
    int BLOCK_DIM_X;
    int BLOCK_DIM_Y;
    int PATCH_PER_BLOCK;

    // These values have to agree with the constants the kernel picks through
    // its __CUDA_ARCH__ check (cc 20 corresponds to __CUDA_ARCH__ 200).
    if (cc >= 20)
    {
        BLOCK_DIM_X = 16;
        BLOCK_DIM_Y = 16;
        PATCH_PER_BLOCK = 4;
    }
    else
    {
        BLOCK_DIM_X = 16;
        BLOCK_DIM_Y = 8;
        PATCH_PER_BLOCK = 2;
    }

    // One block spans BLOCK_DIM_X columns and BLOCK_DIM_Y * PATCH_PER_BLOCK rows.
    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));

    B<T> brd(src.rows);

    // Third launch argument: no dynamically sized shared memory is requested.
    linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);

    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
}
// Kernel-length limit; not referenced by the code in this block.
#define MAX_KERNEL_SIZE 32

/**
 * Column (vertical) pass of the separable filter.
 *
 * Each thread owns one column `x` and PATCH_PER_BLOCK output rows spaced
 * BLOCK_DIM_Y apart, starting at `yStart`. The block first stages an upper
 * halo, the main rows and a lower halo in the __shared__ array `smem`
 * (converted to the float vector type sum_t), then every thread accumulates
 * KSIZE taps from `smem` and writes the saturated result to `dst`.
 *
 * @tparam KSIZE Number of filter taps.
 * @tparam T     Source element type.
 * @tparam D     Destination element type.
 * @tparam B     Border handler providing at_low() / at_high().
 *
 * @param src    Source image.
 * @param dst    Destination image.
 * @param kernel Filter coefficients; kernel[0..KSIZE-1] are read.
 * @param anchor Anchor position; tap k reads the row at offset (k - anchor).
 * @param brd    Border handler used by the first block row and the last block rows.
 */
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
{
    // Must stay in sync with the launch configuration chosen by the host-side caller.
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
        const int BLOCK_DIM_X = 16;
        const int BLOCK_DIM_Y = 16;
        const int PATCH_PER_BLOCK = 4;
        const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
    #else
        const int BLOCK_DIM_X = 16;
        const int BLOCK_DIM_Y = 8;
        const int PATCH_PER_BLOCK = 2;
        const int HALO_SIZE = 2;
    #endif

    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

    // Row layout: [upper halo | main data | lower halo], each measured in units of BLOCK_DIM_Y rows.
    __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];

    const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;

    // Threads to the right of the image exit here, before the barrier below.
    if (x >= src.cols)
        return;

    const T* src_col = src.ptr() + x;

    const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;

    if (blockIdx.y > 0)
    {
        //Upper halo
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
    }
    else
    {
        //Upper halo (first block row: rows above the image go through the border handler)
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
    }

    if (blockIdx.y + 2 < gridDim.y)
    {
        //Main data
        #pragma unroll
        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
            smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));

        //Lower halo
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
    }
    else
    {
        // Last two block rows: all reads go through the border handler.

        //Main data
        #pragma unroll
        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
            smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));

        //Lower halo
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
    }

    // Every staged row must be visible before any thread starts reading neighbours' rows.
    __syncthreads();

    #pragma unroll
    for (int j = 0; j < PATCH_PER_BLOCK; ++j)
    {
        const int y = yStart + j * BLOCK_DIM_Y;

        if (y < src.rows)
        {
            sum_t sum = VecTraits<sum_t>::all(0);

            // Tap k reads the staged row located (k - anchor) rows away from the output row.
            #pragma unroll
            for (int k = 0; k < KSIZE; ++k)
                sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k];

            dst(y, x) = saturate_cast<D>(sum);
        }
    }
}
3. ROW
namespace filter
{
    /**
     * Host entry point of the row (horizontal) pass.
     *
     * Looks up the caller instantiated for the requested border mode and
     * kernel length in a static table and invokes it.
     *
     * @param src      Source image (byte view; cast to PtrStepSz<T> before the call).
     * @param dst      Destination image (byte view; cast to PtrStepSz<D> before the call).
     * @param kernel   Filter coefficients, forwarded unchanged to the caller.
     * @param ksize    Number of coefficients; used directly as the second table index,
     *                 so it must be in [1, 32]. Index 0 holds a null pointer.
     * @param anchor   Anchor position inside the kernel, forwarded to the caller.
     * @param brd_type First table index; rows are ordered Constant, Replicate,
     *                 Reflect, Wrap, Reflect101, so it must be in [0, 4].
     * @param cc       Compute capability value forwarded to the caller.
     * @param stream   CUDA stream forwarded to the caller.
     *
     * Neither index is range-checked here.
     */
    template <typename T, typename D>
    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
    {
        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);

        // callers[border mode][kernel length]; slot 0 of every row is unused.
        static const caller_t callers[5][33] =
        {
            {
                0,
                row_filter::caller< 1, T, D, BrdRowConstant>,
                row_filter::caller< 2, T, D, BrdRowConstant>,
                row_filter::caller< 3, T, D, BrdRowConstant>,
                row_filter::caller< 4, T, D, BrdRowConstant>,
                row_filter::caller< 5, T, D, BrdRowConstant>,
                row_filter::caller< 6, T, D, BrdRowConstant>,
                row_filter::caller< 7, T, D, BrdRowConstant>,
                row_filter::caller< 8, T, D, BrdRowConstant>,
                row_filter::caller< 9, T, D, BrdRowConstant>,
                row_filter::caller<10, T, D, BrdRowConstant>,
                row_filter::caller<11, T, D, BrdRowConstant>,
                row_filter::caller<12, T, D, BrdRowConstant>,
                row_filter::caller<13, T, D, BrdRowConstant>,
                row_filter::caller<14, T, D, BrdRowConstant>,
                row_filter::caller<15, T, D, BrdRowConstant>,
                row_filter::caller<16, T, D, BrdRowConstant>,
                row_filter::caller<17, T, D, BrdRowConstant>,
                row_filter::caller<18, T, D, BrdRowConstant>,
                row_filter::caller<19, T, D, BrdRowConstant>,
                row_filter::caller<20, T, D, BrdRowConstant>,
                row_filter::caller<21, T, D, BrdRowConstant>,
                row_filter::caller<22, T, D, BrdRowConstant>,
                row_filter::caller<23, T, D, BrdRowConstant>,
                row_filter::caller<24, T, D, BrdRowConstant>,
                row_filter::caller<25, T, D, BrdRowConstant>,
                row_filter::caller<26, T, D, BrdRowConstant>,
                row_filter::caller<27, T, D, BrdRowConstant>,
                row_filter::caller<28, T, D, BrdRowConstant>,
                row_filter::caller<29, T, D, BrdRowConstant>,
                row_filter::caller<30, T, D, BrdRowConstant>,
                row_filter::caller<31, T, D, BrdRowConstant>,
                row_filter::caller<32, T, D, BrdRowConstant>
            },
            {
                0,
                row_filter::caller< 1, T, D, BrdRowReplicate>,
                row_filter::caller< 2, T, D, BrdRowReplicate>,
                row_filter::caller< 3, T, D, BrdRowReplicate>,
                row_filter::caller< 4, T, D, BrdRowReplicate>,
                row_filter::caller< 5, T, D, BrdRowReplicate>,
                row_filter::caller< 6, T, D, BrdRowReplicate>,
                row_filter::caller< 7, T, D, BrdRowReplicate>,
                row_filter::caller< 8, T, D, BrdRowReplicate>,
                row_filter::caller< 9, T, D, BrdRowReplicate>,
                row_filter::caller<10, T, D, BrdRowReplicate>,
                row_filter::caller<11, T, D, BrdRowReplicate>,
                row_filter::caller<12, T, D, BrdRowReplicate>,
                row_filter::caller<13, T, D, BrdRowReplicate>,
                row_filter::caller<14, T, D, BrdRowReplicate>,
                row_filter::caller<15, T, D, BrdRowReplicate>,
                row_filter::caller<16, T, D, BrdRowReplicate>,
                row_filter::caller<17, T, D, BrdRowReplicate>,
                row_filter::caller<18, T, D, BrdRowReplicate>,
                row_filter::caller<19, T, D, BrdRowReplicate>,
                row_filter::caller<20, T, D, BrdRowReplicate>,
                row_filter::caller<21, T, D, BrdRowReplicate>,
                row_filter::caller<22, T, D, BrdRowReplicate>,
                row_filter::caller<23, T, D, BrdRowReplicate>,
                row_filter::caller<24, T, D, BrdRowReplicate>,
                row_filter::caller<25, T, D, BrdRowReplicate>,
                row_filter::caller<26, T, D, BrdRowReplicate>,
                row_filter::caller<27, T, D, BrdRowReplicate>,
                row_filter::caller<28, T, D, BrdRowReplicate>,
                row_filter::caller<29, T, D, BrdRowReplicate>,
                row_filter::caller<30, T, D, BrdRowReplicate>,
                row_filter::caller<31, T, D, BrdRowReplicate>,
                row_filter::caller<32, T, D, BrdRowReplicate>
            },
            {
                0,
                row_filter::caller< 1, T, D, BrdRowReflect>,
                row_filter::caller< 2, T, D, BrdRowReflect>,
                row_filter::caller< 3, T, D, BrdRowReflect>,
                row_filter::caller< 4, T, D, BrdRowReflect>,
                row_filter::caller< 5, T, D, BrdRowReflect>,
                row_filter::caller< 6, T, D, BrdRowReflect>,
                row_filter::caller< 7, T, D, BrdRowReflect>,
                row_filter::caller< 8, T, D, BrdRowReflect>,
                row_filter::caller< 9, T, D, BrdRowReflect>,
                row_filter::caller<10, T, D, BrdRowReflect>,
                row_filter::caller<11, T, D, BrdRowReflect>,
                row_filter::caller<12, T, D, BrdRowReflect>,
                row_filter::caller<13, T, D, BrdRowReflect>,
                row_filter::caller<14, T, D, BrdRowReflect>,
                row_filter::caller<15, T, D, BrdRowReflect>,
                row_filter::caller<16, T, D, BrdRowReflect>,
                row_filter::caller<17, T, D, BrdRowReflect>,
                row_filter::caller<18, T, D, BrdRowReflect>,
                row_filter::caller<19, T, D, BrdRowReflect>,
                row_filter::caller<20, T, D, BrdRowReflect>,
                row_filter::caller<21, T, D, BrdRowReflect>,
                row_filter::caller<22, T, D, BrdRowReflect>,
                row_filter::caller<23, T, D, BrdRowReflect>,
                row_filter::caller<24, T, D, BrdRowReflect>,
                row_filter::caller<25, T, D, BrdRowReflect>,
                row_filter::caller<26, T, D, BrdRowReflect>,
                row_filter::caller<27, T, D, BrdRowReflect>,
                row_filter::caller<28, T, D, BrdRowReflect>,
                row_filter::caller<29, T, D, BrdRowReflect>,
                row_filter::caller<30, T, D, BrdRowReflect>,
                row_filter::caller<31, T, D, BrdRowReflect>,
                row_filter::caller<32, T, D, BrdRowReflect>
            },
            {
                0,
                row_filter::caller< 1, T, D, BrdRowWrap>,
                row_filter::caller< 2, T, D, BrdRowWrap>,
                row_filter::caller< 3, T, D, BrdRowWrap>,
                row_filter::caller< 4, T, D, BrdRowWrap>,
                row_filter::caller< 5, T, D, BrdRowWrap>,
                row_filter::caller< 6, T, D, BrdRowWrap>,
                row_filter::caller< 7, T, D, BrdRowWrap>,
                row_filter::caller< 8, T, D, BrdRowWrap>,
                row_filter::caller< 9, T, D, BrdRowWrap>,
                row_filter::caller<10, T, D, BrdRowWrap>,
                row_filter::caller<11, T, D, BrdRowWrap>,
                row_filter::caller<12, T, D, BrdRowWrap>,
                row_filter::caller<13, T, D, BrdRowWrap>,
                row_filter::caller<14, T, D, BrdRowWrap>,
                row_filter::caller<15, T, D, BrdRowWrap>,
                row_filter::caller<16, T, D, BrdRowWrap>,
                row_filter::caller<17, T, D, BrdRowWrap>,
                row_filter::caller<18, T, D, BrdRowWrap>,
                row_filter::caller<19, T, D, BrdRowWrap>,
                row_filter::caller<20, T, D, BrdRowWrap>,
                row_filter::caller<21, T, D, BrdRowWrap>,
                row_filter::caller<22, T, D, BrdRowWrap>,
                row_filter::caller<23, T, D, BrdRowWrap>,
                row_filter::caller<24, T, D, BrdRowWrap>,
                row_filter::caller<25, T, D, BrdRowWrap>,
                row_filter::caller<26, T, D, BrdRowWrap>,
                row_filter::caller<27, T, D, BrdRowWrap>,
                row_filter::caller<28, T, D, BrdRowWrap>,
                row_filter::caller<29, T, D, BrdRowWrap>,
                row_filter::caller<30, T, D, BrdRowWrap>,
                row_filter::caller<31, T, D, BrdRowWrap>,
                row_filter::caller<32, T, D, BrdRowWrap>
            },
            {
                0,
                row_filter::caller< 1, T, D, BrdRowReflect101>,
                row_filter::caller< 2, T, D, BrdRowReflect101>,
                row_filter::caller< 3, T, D, BrdRowReflect101>,
                row_filter::caller< 4, T, D, BrdRowReflect101>,
                row_filter::caller< 5, T, D, BrdRowReflect101>,
                row_filter::caller< 6, T, D, BrdRowReflect101>,
                row_filter::caller< 7, T, D, BrdRowReflect101>,
                row_filter::caller< 8, T, D, BrdRowReflect101>,
                row_filter::caller< 9, T, D, BrdRowReflect101>,
                row_filter::caller<10, T, D, BrdRowReflect101>,
                row_filter::caller<11, T, D, BrdRowReflect101>,
                row_filter::caller<12, T, D, BrdRowReflect101>,
                row_filter::caller<13, T, D, BrdRowReflect101>,
                row_filter::caller<14, T, D, BrdRowReflect101>,
                row_filter::caller<15, T, D, BrdRowReflect101>,
                row_filter::caller<16, T, D, BrdRowReflect101>,
                row_filter::caller<17, T, D, BrdRowReflect101>,
                row_filter::caller<18, T, D, BrdRowReflect101>,
                row_filter::caller<19, T, D, BrdRowReflect101>,
                row_filter::caller<20, T, D, BrdRowReflect101>,
                row_filter::caller<21, T, D, BrdRowReflect101>,
                row_filter::caller<22, T, D, BrdRowReflect101>,
                row_filter::caller<23, T, D, BrdRowReflect101>,
                row_filter::caller<24, T, D, BrdRowReflect101>,
                row_filter::caller<25, T, D, BrdRowReflect101>,
                row_filter::caller<26, T, D, BrdRowReflect101>,
                row_filter::caller<27, T, D, BrdRowReflect101>,
                row_filter::caller<28, T, D, BrdRowReflect101>,
                row_filter::caller<29, T, D, BrdRowReflect101>,
                row_filter::caller<30, T, D, BrdRowReflect101>,
                row_filter::caller<31, T, D, BrdRowReflect101>,
                row_filter::caller<32, T, D, BrdRowReflect101>
            }
        };

        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
    }
}
/**
 * Launches linearRowFilter for one kernel length / border mode combination.
 * The dispatch table in filter::linearRow refers to this function as
 * row_filter::caller.
 *
 * @tparam KSIZE Kernel length, forwarded to the kernel as a template argument.
 * @tparam T     Source element type.
 * @tparam D     Destination element type.
 * @tparam B     Border-handling class template; instantiated as B<T>(src.cols).
 *
 * @param src    Source image.
 * @param dst    Destination image.
 * @param kernel Filter coefficients passed straight to the kernel.
 * @param anchor Anchor position inside the kernel.
 * @param cc     Compute capability as major * 10 + minor; selects the block shape.
 * @param stream Stream to launch on. When it is 0 the call synchronises the
 *               device before returning.
 */
template <int KSIZE, typename T, typename D, template<typename> class B>
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
{
    int BLOCK_DIM_X;
    int BLOCK_DIM_Y;
    int PATCH_PER_BLOCK;

    // These values have to agree with the constants the kernel picks through
    // its __CUDA_ARCH__ check (cc 20 corresponds to __CUDA_ARCH__ 200).
    if (cc >= 20)
    {
        BLOCK_DIM_X = 32;
        BLOCK_DIM_Y = 8;
        PATCH_PER_BLOCK = 4;
    }
    else
    {
        BLOCK_DIM_X = 32;
        BLOCK_DIM_Y = 4;
        PATCH_PER_BLOCK = 4;
    }

    // One block spans BLOCK_DIM_X * PATCH_PER_BLOCK columns and BLOCK_DIM_Y rows.
    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));

    B<T> brd(src.cols);

    // Third launch argument: no dynamically sized shared memory is requested.
    linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);

    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Kernel-length limit; not referenced by the code in this block.
#define MAX_KERNEL_SIZE 32

/**
 * Row (horizontal) pass of the separable filter.
 *
 * Each thread owns one row `y` and PATCH_PER_BLOCK output columns spaced
 * BLOCK_DIM_X apart, starting at `xStart`. The block first stages a left
 * halo, the main columns and a right halo in the __shared__ array `smem`
 * (converted to the float vector type sum_t), then every thread accumulates
 * KSIZE taps from `smem` and writes the saturated result to `dst`.
 *
 * @tparam KSIZE Number of filter taps.
 * @tparam T     Source element type.
 * @tparam D     Destination element type.
 * @tparam B     Border handler providing at_low() / at_high().
 *
 * @param src    Source image.
 * @param dst    Destination image.
 * @param kernel Filter coefficients; kernel[0..KSIZE-1] are read.
 * @param anchor Anchor position; tap k reads the column at offset (k - anchor).
 * @param brd    Border handler used by the first block column and the last block columns.
 */
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
{
    // Must stay in sync with the launch configuration chosen by the host-side caller.
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
        const int BLOCK_DIM_X = 32;
        const int BLOCK_DIM_Y = 8;
        const int PATCH_PER_BLOCK = 4;
        const int HALO_SIZE = 1;
    #else
        const int BLOCK_DIM_X = 32;
        const int BLOCK_DIM_Y = 4;
        const int PATCH_PER_BLOCK = 4;
        const int HALO_SIZE = 1;
    #endif

    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

    // Column layout: [left halo | main data | right halo], each measured in units of BLOCK_DIM_X columns.
    __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];

    const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

    // Threads below the image exit here, before the barrier below.
    if (y >= src.rows)
        return;

    const T* src_row = src.ptr(y);

    const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;

    if (blockIdx.x > 0)
    {
        //Load left halo
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
    }
    else
    {
        //Load left halo (first block column: columns left of the image go through the border handler)
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
    }

    if (blockIdx.x + 2 < gridDim.x)
    {
        //Load main data
        #pragma unroll
        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
            smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);

        //Load right halo
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
    }
    else
    {
        // Last two block columns: all reads go through the border handler.

        //Load main data
        #pragma unroll
        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
            smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));

        //Load right halo
        #pragma unroll
        for (int j = 0; j < HALO_SIZE; ++j)
            smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
    }

    // Every staged column must be visible before any thread starts reading neighbours' columns.
    __syncthreads();

    #pragma unroll
    for (int j = 0; j < PATCH_PER_BLOCK; ++j)
    {
        const int x = xStart + j * BLOCK_DIM_X;

        if (x < src.cols)
        {
            sum_t sum = VecTraits<sum_t>::all(0);

            // Tap k reads the staged column located (k - anchor) columns away from the output column.
            #pragma unroll
            for (int k = 0; k < KSIZE; ++k)
                sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k];

            dst(y, x) = saturate_cast<D>(sum);
        }
    }
}
opencv 源码分析 CUDA可分离滤波器设计 ( 发现OpenCV的cuda真TM慢 )的更多相关文章
- OpenCV源码分析:RGB到其他色彩空间的转换
1.流程调用图 2.部分代码分析 //模板函数进行颜色空间的转换 template <typename Cvt> void CvtColorLoop(const Mat& src, ...
- jQuery源码分析系列
声明:本文为原创文章,如需转载,请注明来源并保留原文链接Aaron,谢谢! 版本截止到2013.8.24 jQuery官方发布最新的的2.0.3为准 附上每一章的源码注释分析 :https://git ...
- Redis学习——ae事件处理源码分析
0. 前言 Redis在封装事件的处理采用了Reactor模式,添加了定时事件的处理.Redis处理事件是单进程单线程的,而经典Reator模式对事件是串行处理的.即如果有一个事件阻塞过久的话会导致整 ...
- [转]jQuery源码分析系列
文章转自:jQuery源码分析系列-Aaron 版本截止到2013.8.24 jQuery官方发布最新的的2.0.3为准 附上每一章的源码注释分析 :https://github.com/JsAaro ...
- jQuery源码分析系列——来自Aaron
jQuery源码分析系列——来自Aaron 转载地址:http://www.cnblogs.com/aaronjs/p/3279314.html 版本截止到2013.8.24 jQuery官方发布最新 ...
- MQTT再学习 -- MQTT 客户端源码分析
MQTT 源码分析,搜索了一下发现网络上讲的很少,多是逍遥子的那几篇. 参看:逍遥子_mosquitto源码分析系列 参看:MQTT libmosquitto源码分析 参看:Mosquitto学习笔记 ...
- Visual Studio调试到OpenCV源码中
TL;DR VS2015下,build-farm/vs2015-x64/bin/Debug/目录,*.pdb文件,都拷贝到install/x64/vc14/bin目录,就可以调试进去opencv源码了 ...
- springmvc拦截器入门及其执行顺序源码分析
springmvc拦截器是偶尔会用到的一个功能,本案例来演示一个较简单的springmvc拦截器的使用,并通过源码来分析拦截器的执行顺序的控制.具体操作步骤为:1.maven项目引入spring依赖2 ...
- druid 源码分析与学习(含详细监控设计思路的彩蛋)(转)
原文路径:http://herman-liu76.iteye.com/blog/2308563 Druid是阿里巴巴公司的数据库连接池工具,昨天突然想学习一下阿里的druid源码,于是下载下来分析了 ...
随机推荐
- spring boot validation参数校验
对于任何一个应用而言在客户端做的数据有效性验证都不是安全有效的,这时候就要求我们在开发的时候在服务端也对数据的有效性进行验证. Spring Boot自身对数据在服务端的校验有一个比较好的支持,它能将 ...
- SDN上机第三次作业
1. 利用Mininet仿真平台构建如下图所示的网络拓扑,配置主机h1和h2的IP地址(h1:10.0.0.1,h2:10.0.0.2),测试两台主机之间的网络连通性 使用miniedit进行创建操作 ...
- 20189220 余超《Linux内核原理与分析》第四周作业
构造一个简单的Linux系统MenuOS 第三章基础知识 计算机的三大法宝:存储计算机,函数调用堆栈,中断. 操作系统的两把宝剑:中断上下文,进程上下文. Linux内核源码的目录结构: arch目录 ...
- RUN vs CMD vs ENTRYPOINT
参考:https://www.ibm.com/developerworks/community/blogs/132cfa78-44b0-4376-85d0-d3096cd30d3f/entry/RUN ...
- Python17个常用内置模块总结
Python17个常用内置模块总结 1.getpass 2.os 3.sys 4.subprocess 5.hashlib 6.json 7.pickle 8.shutil 9.time 10.dat ...
- gogs 邀请协作者 500错误
触发原因: 对db文件的user表删了某个用户导致 解决: 注册个新用户,把id改成原来的id(默认都会自增长)
- java8之Spliterator
基本用法: import java.util.Arrays; import java.util.Spliterator; import java.util.stream.IntStream; publ ...
- 026_如何在MAC下输入主要国家货币符号?
由于出国旅游啥的经常会记录一些东西,不避免的会遇到各种货币符号 一. 人民币: ¥(sogo输入法切换到中文模式,然后"shift键 + 4"即可) 美元: $(sogo输入法切换 ...
- linux设置程序运行超时时间
在某些情况下,我们需要限制程序的运行时间(比如cronjob等),这里简单介绍下使用信号及timeout的实现方法 1. 假如有如下代码(test_timout.sh): #!/bin/bash wh ...
- 013-centos7 常用命令--查看当前用户的4种方法
一.概述 4种查看系统用户信息(通过编号(ID))的方法. 1.1. 使用w命令查看登录用户正在使用的进程信息 w命令用于显示已经登录系统的用户的名称,以及他们正在做的事.该命令所使用的信息来源于/v ...