cuda yv12_to_rgb24
前言
项目需要将YV12转换为RGB24。由于项目基于x86平台,一开始没有多想,直接用IPP加速实现了。后来在评估项目瓶颈时发现,1080p的视频每一帧的转换居然要花8ms。刚好项目里用到了NVIDIA GTX 960,因此就产生了直接用CUDA实现一个YV12转RGB24的想法。
具体实施
我一向不喜欢造轮子,因此,第一步就是搜索有没有现成的代码。搜索了很久,包括opencv里都没找到yv12 to rgb24的实现,还好在网上找到了一篇yv12 to argb的,我拿过来照着改改就ok了(包括代码风格调整及bug修复)。下面直接贴出代码,有任何疑问,可以留言讨论。
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include <stdio.h>
// Width in bits of one colour component once the 8-bit samples have been
// scaled up (the kernel shifts each 8-bit sample left by 2).
#define COLOR_COMPONENT_BIT_SIZE 10
// Mask that selects a single 10-bit component (2^10 - 1).
#define COLOR_COMPONENT_MASK 0x3FF
// 3x3 YCbCr -> RGB conversion matrix, stored row-major in constant memory.
// Rows produce R, G and B in that order; the columns are multiplied by
// luma, Cb and Cr respectively (see yuv2rgb).
// NOTE(review): the coefficients look like the usual ITU-R BT.601
// limited-range ("studio swing") values -- confirm against the video source.
__constant__ float const_hue_colorspace_mat[9]={1.1644f,0.0f,1.596f,1.1644f,-0.3918f,-0.813f,1.1644f,2.0172f,0.0f};
/**
 * Converts a single pixel from YCbCr to RGB, both at 10-bit scale.
 *
 * @param yuvi   Three consecutive ints {Y, Cb, Cr}; the caller supplies 8-bit
 *               samples shifted left by 2, i.e. values in [0, 1020].
 * @param red    Receives the red result (not clamped).
 * @param green  Receives the green result (not clamped).
 * @param blue   Receives the blue result (not clamped).
 *
 * The results may fall outside [0, 1023]; clamping is left to the caller.
 *
 * The luma gain in const_hue_colorspace_mat (1.1644 = 255/219) is the
 * limited-range coefficient, which is only correct when the black-level
 * offset (16 at 8 bits, 64 at 10 bits) is removed from Y first. The previous
 * code skipped that step, so black came out as ~18 instead of 0 and the whole
 * picture was lifted and clipped early in the highlights.
 */
__device__ static void yuv2rgb(const int *yuvi, float *red, float *green,float *blue)
{
    // Offsets expressed at 10-bit scale: 16 << 2 and 128 << 2.
    const int luma_offset   = 64;
    const int chroma_offset = 512;

    // Remove the black level from Y and centre the chroma samples on zero.
    const float luma     = (float)(yuvi[0] - luma_offset);
    const float chromacb = (float)(yuvi[1] - chroma_offset);
    const float chromacr = (float)(yuvi[2] - chroma_offset);

    // Row 0 of the matrix yields red, row 1 green, row 2 blue.
    *red   = (luma     * const_hue_colorspace_mat[0]) +
             (chromacb * const_hue_colorspace_mat[1]) +
             (chromacr * const_hue_colorspace_mat[2]);
    *green = (luma     * const_hue_colorspace_mat[3]) +
             (chromacb * const_hue_colorspace_mat[4]) +
             (chromacr * const_hue_colorspace_mat[5]);
    *blue  = (luma     * const_hue_colorspace_mat[6]) +
             (chromacb * const_hue_colorspace_mat[7]) +
             (chromacr * const_hue_colorspace_mat[8]);
}
// Clamps one 10-bit-scale colour channel to [0, 1023] and narrows it to 8 bits.
__device__ static __forceinline__ unsigned char clamp10_to_byte(float channel)
{
    const float limited = fminf(fmaxf(channel, 0.0f), 1023.f);
    return (unsigned char)((((unsigned int)limited) & COLOR_COMPONENT_MASK) >> 2);
}

/**
 * Converts a YV12 frame to packed 24-bit pixels, one 2x2 pixel tile per thread.
 *
 * Launch layout: 2D grid of 2D blocks; a block of (bx, by) threads covers
 * (2*bx) x (2*by) pixels. No shared memory is used.
 *
 * @param src        YV12 frame: width*height luma bytes, followed by the V
 *                   plane, followed by the U plane; both chroma planes use a
 *                   row pitch of width/2.
 * @param dst        Output image; each pixel is written as the bytes B, G, R.
 * @param width      Frame width in pixels.
 * @param height     Frame height in pixels.
 * @param dst_pitch  Distance in bytes between consecutive output rows.
 *
 * Tiles that are not fully inside the frame are skipped, so with an odd
 * width or height the last column or row is left unwritten.
 */
__global__ void yv12torgb24_fourpixel(const unsigned char *src, unsigned char *dst, int width, int height, int dst_pitch)
{
    // Top-left pixel of the 2x2 tile owned by this thread.
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
    const int y = (blockIdx.y * blockDim.y + threadIdx.y) << 1;

    // Only complete tiles are converted.
    if (x + 1 >= width || y + 1 >= height)
        return;

    // Plane layout: Y, then V, then U (each chroma plane is a quarter of Y).
    const unsigned int v_plane      = width * height;
    const unsigned int u_plane      = v_plane + (v_plane >> 2);
    const unsigned int chroma_pitch = width >> 1;
    const unsigned int chroma_index = (y >> 1) * chroma_pitch + (x >> 1);

    // The four pixels of the tile share one Cb/Cr pair; scale it to 10 bits.
    const int cb10 = src[u_plane + chroma_index] << 2; // U
    const int cr10 = src[v_plane + chroma_index] << 2; // V

    // Convert all four pixels before anything is stored.
    // Tile order: 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right.
    float red[4], green[4], blue[4];
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        const int row = i >> 1;
        const int col = i & 1;
        const int yuvi[3] = { src[(y + row) * width + x + col] << 2, cb10, cr10 };
        yuv2rgb(yuvi, &red[i], &green[i], &blue[i]);
    }

    // Clamp, narrow to 8 bits and store each pixel in B, G, R byte order.
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        const int out = (y + (i >> 1)) * dst_pitch + (x + (i & 1)) * 3;
        dst[out + 0] = clamp10_to_byte(blue[i]);
        dst[out + 1] = clamp10_to_byte(green[i]);
        dst[out + 2] = clamp10_to_byte(red[i]);
    }
}
/**
 * Uploads one YV12 frame from host memory and converts it on the GPU.
 *
 * @param src         Host pointer to the YV12 frame
 *                    (src_width * src_height * 3 / 2 bytes are uploaded).
 * @param dst         Output buffer. It is handed to the kernel unchanged and
 *                    nothing is copied back to the host here, so it must be
 *                    memory the device can write.
 * @param src_width   Frame width in pixels.
 * @param src_height  Frame height in pixels.
 * @param dst_pitch   Distance in bytes between consecutive output rows.
 * @return true when the upload, the kernel and the cleanup all succeeded;
 *         false on invalid arguments or on any CUDA error (the error text is
 *         printed to stderr).
 *
 * The kernel converts whole 2x2 tiles only, so even dimensions are expected;
 * with an odd width or height the trailing column or row is not written.
 *
 * NOTE(review): a staging buffer is allocated and freed on every call; when
 * this runs once per frame, allocating it once at the caller level would
 * avoid that cost.
 */
bool yv12_to_rgb24(unsigned char *src, unsigned char *dst,int src_width,int src_height, int dst_pitch)
{
    if (src == NULL || dst == NULL || src_width <= 0 || src_height <= 0)
        return false;

    // Computed in size_t so large frames cannot wrap the byte count.
    const size_t src_mem_size = (size_t)src_width * (size_t)src_height * 3 / 2;

    // Every thread converts a 2x2 tile, hence the factor 2 in the grid size.
    const dim3 block(32, 8);
    const int gridx = (src_width  + 2 * block.x - 1) / (2 * block.x);
    const int gridy = (src_height + 2 * block.y - 1) / (2 * block.y);
    const dim3 grid(gridx, gridy);

    unsigned char *d_src = NULL;
    cudaError_t err = cudaMalloc((void**)&d_src, src_mem_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "yv12_to_rgb24: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return false;
    }

    err = cudaMemcpy(d_src, src, src_mem_size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        yv12torgb24_fourpixel<<<grid,block>>>(d_src, dst, src_width, src_height, dst_pitch);
        // Launch-configuration errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Faults raised while the kernel runs surface at the next sync point.
        err = cudaDeviceSynchronize();
    }

    // Release the staging buffer on every path; keep the first error seen.
    const cudaError_t free_err = cudaFree(d_src);
    if (err == cudaSuccess)
        err = free_err;

    if (err != cudaSuccess) {
        fprintf(stderr, "yv12_to_rgb24: CUDA error: %s\n", cudaGetErrorString(err));
        return false;
    }
    return true;
}
总结
经过cuda加速后,转换能够在1ms左右完成,还是比较理想的。
完!
2016年8月
cuda yv12_to_rgb24的更多相关文章
- CUDA[2] Hello,World
Section 0:Hello,World 这次我们亲自尝试一下如何用粗(CU)大(DA)写程序 CUDA最新版本是7.5,然而即使是最新版本也不兼容VS2015 ...推荐使用VS2012 进入VS ...
- CUDA[1] Introductory
Section 0 :Induction of CUDA CUDA是啥?CUDA®: A General-Purpose Parallel Computing Platform and Program ...
- Couldn't open CUDA library cublas64_80.dll etc. tensorflow-gpu on windows
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_load ...
- ubuntu 16.04 + N驱动安装 +CUDA+Qt5 + opencv
Nvidia driver installation(after download XX.run installation file) 1. ctrl+Alt+F1 //go to virtual ...
- 手把手教你搭建深度学习平台——避坑安装theano+CUDA
python有多混乱我就不多说了.这个混论不仅是指整个python市场混乱,更混乱的还有python的各种附加依赖包.为了一劳永逸解决python的各种依赖包对深度学习造成的影响,本文中采用pytho ...
- [CUDA] CUDA to DL
又是一枚祖国的骚年,阅览做做笔记:http://www.cnblogs.com/neopenx/p/4643705.html 这里只是一些基础知识.帮助理解DL tool的实现. “这也是深度学习带来 ...
- 基于Ubuntu14.04系统的nvidia tesla K40驱动和cuda 7.5安装笔记
基于Ubuntu14.04系统的nvidia tesla K40驱动和cuda 7.5安装笔记 飞翔的蜘蛛人 注1:本人新手,文章中不准确的地方,欢迎批评指正 注2:知识储备应达到Linux入门级水平 ...
- CUDA程序设计(一)
为什么需要GPU 几年前我启动并主导了一个项目,当时还在谷歌,这个项目叫谷歌大脑.该项目利用谷歌的计算基础设施来构建神经网络. 规模大概比之前的神经网络扩大了一百倍,我们的方法是用约一千台电脑.这确实 ...
- 使用 CUDA范例精解通用GPU编程 配套程序的方法
用vs新建一个cuda的项目,然后将系统自动生成的那个.cu里头的内容,除了头文件引用外,全部替代成先有代码的内容. 然后程序就能跑了. 因为新建的是cuda的项目,所以所有的头文件和库的引用系统都会 ...
随机推荐
- 第二十二篇 正则表达式 re模块
re模块****** 就本质而言,正则表达式是一种小型的,高度专业化的编程语言,在python里,它内嵌在python中,并通过re模块实现.正则表达式模式被编译成一系列的字节码.然后用C编写的匹配引 ...
- 4、shader透明测试(AlphaTest)
主要用于花草树木 用3D的Plane来实现透明的例子: 给Plane先赋予一个带alpha通道的透明图片,但是此图片此时是看不出来是透明的,如下: 现在我们要做的就是显示透明的效果:现在就用到了alp ...
- windows下网络命令----Tracert命令详解
现在网络四通八达,网线光纤基站卫星,只要运营商能收费的地方,就有网络,覆盖了全世界所有的区域.彻底改变了以前通讯基本靠吼的情况.那么宽广的网络世界,超过100米就得需要中继放大信号的网线,即使现在的光 ...
- 关于缺失值(missing value)的处理---机器学习 Imputer
关于缺失值(missing value)的处理 在sklearn的preprocessing包中包含了对数据集中缺失值的处理,主要是应用Imputer类进行处理. 首先需要说明的是,numpy的数组中 ...
- 甲级1002 A+B for Polynomials (25)
题目描述: This time, you are supposed to find A+B where A and B are two polynomials. Input Each input fi ...
- 关于socket的疑问
一直感觉一端发送数据,另一端接受数据很不可思议的事情,如果不能即时地读走会导致什么后果呢? 其实socket读出来的数据,你自己看着办,里面的数据是什么格式你自己去解析,用户可以基于TCP去实现你自己 ...
- password & Encryption
password & Encryption cipher https://dev.tencent.com/login
- 2018牛客多校第一场 D.Two Graphs
题意: n个点,m1条边的图E1,n个点,m2条边的图E2.求图E2有多少子图跟图E1同构. 题解: 用STL的全排列函数next_permutation()枚举映射.对于每一种映射枚举每一条边判断合 ...
- [bzoj] 2694 Lcm || 莫比乌斯反演
原题 定义整数a,b,求所有满足条件的lcm(a,b)的和: \(1 \le a \le A\),\(1 \le b \le B\),\(\forall n>1,\ n^2 \nmid \gcd(a,b)\)(即任意n>1,\(n^2\)不是gc ...
- C++——继承时的this指针
1.this指针只在类的成员函数中使用,当类的成员函数需要用到自己的指针时就要用到this指针.但静态函数不能使用this关键字,其解释是:因为this是个引用,哪个对象调用方法就引用哪个对象. 而静 ...