前言

  项目需要将yv12转rgb24,由于基于x86平台,开始就没多想,直接用ipp加速实现了,后来在评估项目瓶颈的时候发现,1080p的视频每一帧转换居然要花8ms,刚好项目里有用到nvidia gtx960,因此就产生了直接用cuda实现一个yv12转rgb24的想法。

具体实施

  我一向不喜欢造轮子,因此,第一步就是搜索有没有现成的代码。搜索了很久,包括opencv里都没找到yv12 to rgb24的,还好网上找到了一篇yv12 to argb的,我拿过来照着改改就ok了(包括代码风格及bug修复)。下面直接贴出代码,有任何疑问,可以留言讨论

  1. #include "cuda.h"
  2. #include "cuda_runtime.h"
  3. #include "cuda_runtime_api.h"
  4. #include <stdio.h>
  // Colour components are carried as 10-bit values; the 8-bit samples read
  // from the source frame are shifted left by 2 before conversion.
  #define COLOR_COMPONENT_BIT_SIZE 10
  #define COLOR_COMPONENT_MASK ((1 << COLOR_COMPONENT_BIT_SIZE) - 1)

  // Row-major 3x3 conversion matrix used by yuv2rgb: rows yield R, G, B and
  // columns multiply luma, Cb, Cr respectively.
  __constant__ float const_hue_colorspace_mat[9] = {
      1.1644f,  0.0f,     1.596f,
      1.1644f, -0.3918f, -0.813f,
      1.1644f,  2.0172f,  0.0f
  };
  /**
   * Converts one (luma, Cb, Cr) triple to floating-point R, G, B.
   *
   * yuvi  : three consecutive ints -- luma, Cb, Cr.
   * red, green, blue : receive the unclamped results of multiplying the
   *         triple by const_hue_colorspace_mat (one matrix row per channel).
   *
   * 512 is subtracted from both chroma values before the multiply; luma is
   * used as-is.
   *
   * NOTE(review): luma is not offset before scaling. The 1.1644 gain looks
   * like a limited-range matrix, which would normally subtract the black
   * level first -- confirm the intended input range.
   */
  __device__ static void yuv2rgb(const int *yuvi, float *red, float *green, float *blue)
  {
      const float (&m)[9] = const_hue_colorspace_mat;

      // Centre the chroma samples around zero.
      const float y  = (float)yuvi[0];
      const float cb = (float)((int)yuvi[1] - 512.0f);
      const float cr = (float)((int)yuvi[2] - 512.0f);

      // One matrix row per output channel.
      *red   = (y * m[0]) + (cb * m[1]) + (cr * m[2]);
      *green = (y * m[3]) + (cb * m[4]) + (cr * m[5]);
      *blue  = (y * m[6]) + (cb * m[7]) + (cr * m[8]);
  }
  /**
   * Converts a planar YV12 frame to packed 24-bit pixels; each thread handles
   * one 2x2 pixel quad that shares a single chroma pair.
   *
   * src       : read as width*height luma bytes, followed by a plane with a
   *             pitch of width/2 that is read as V (Cr), followed by a plane of
   *             the same size that is read as U (Cb).
   * dst       : receives 3 bytes per pixel in B, G, R order.
   * dst_pitch : distance in bytes between the starts of consecutive dst rows.
   *
   * Launch layout: 2D grid and block. A thread's quad starts at
   * (2 * global thread x, 2 * global thread y). Threads whose quad would reach
   * column >= width or row >= height return without writing anything.
   */
  __global__ void yv12torgb24_fourpixel(const unsigned char *src, unsigned char *dst, int width, int height, int dst_pitch)
  {
      // Top-left pixel of this thread's 2x2 quad.
      const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2u;
      const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2u;
      if (!((x + 1) < width && (y + 1) < height))
          return;

      // Read phase: four luma samples, scaled from 8 to 10 bits.
      int luma[4];
      luma[0] = src[y * width + x] << 2;
      luma[1] = src[y * width + x + 1] << 2;
      luma[2] = src[(y + 1) * width + x] << 2;
      luma[3] = src[(y + 1) * width + x + 1] << 2;

      // One chroma pair for the whole quad: the V plane follows the luma
      // plane, and the U plane follows V.
      const unsigned int v_base = width * height;
      const unsigned int u_base = v_base + (v_base >> 2);
      const unsigned int chroma_pitch = width >> 1;
      const unsigned int chroma_index = (y >> 1) * chroma_pitch + (x >> 1);
      const int cb = src[u_base + chroma_index] << 2; // U
      const int cr = src[v_base + chroma_index] << 2; // V

      // Convert all four pixels before any store.
      float r[4], g[4], b[4];
      for (int i = 0; i < 4; ++i) {
          int ycc[3];
          ycc[0] = luma[i];
          ycc[1] = cb;
          ycc[2] = cr;
          yuv2rgb(ycc, &r[i], &g[i], &b[i]);
      }

      // Write phase: clamp to the 10-bit range, drop back to 8 bits, and store
      // as B, G, R. Pixel i sits at column x + (i & 1), row y + (i >> 1).
      for (int i = 0; i < 4; ++i) {
          unsigned char *px = dst + (y + (i >> 1)) * dst_pitch + (x + (i & 1)) * 3;
          const float rc = fminf(fmaxf(r[i], 0.0f), 1023.f);
          const float gc = fminf(fmaxf(g[i], 0.0f), 1023.f);
          const float bc = fminf(fmaxf(b[i], 0.0f), 1023.f);
          px[0] = (((unsigned int)bc) & 0x3ff) >> 2;
          px[1] = (((unsigned int)gc) & 0x3ff) >> 2;
          px[2] = (((unsigned int)rc) & 0x3ff) >> 2;
      }
  }
  /**
   * Uploads a host YV12 frame to the GPU and converts it with
   * yv12torgb24_fourpixel.
   *
   * src        : host pointer to src_width * src_height * 3 / 2 bytes; copied
   *              to a temporary device buffer.
   * dst        : passed unchanged to the kernel, so it must be memory the
   *              device can write.
   *              NOTE(review): presumably a device allocation of at least
   *              src_height * dst_pitch bytes -- confirm against callers.
   * src_width, src_height : frame size in pixels; must be positive.
   * dst_pitch  : byte distance between consecutive rows of dst.
   *
   * Returns true when every CUDA call and the kernel itself succeeded, and
   * false on invalid arguments or any CUDA failure (the error string is
   * printed to stderr). The temporary device buffer is released on every path.
   */
  bool yv12_to_rgb24(unsigned char *src, unsigned char *dst,int src_width,int src_height, int dst_pitch)
  {
      if (src == NULL || dst == NULL || src_width <= 0 || src_height <= 0)
          return false;

      // Luma plane plus two quarter-size chroma planes; computed in size_t so
      // the byte count is not truncated to 32 bits.
      const size_t src_mem_size = (size_t)src_width * (size_t)src_height * 3 / 2;

      // Each thread covers a 2x2 quad, so one block spans 2*block.x by
      // 2*block.y pixels; round the grid up to cover the whole frame.
      const dim3 block(32, 8);
      const dim3 grid((src_width  + 2 * block.x - 1) / (2 * block.x),
                      (src_height + 2 * block.y - 1) / (2 * block.y));

      unsigned char *d_src = NULL;
      cudaError_t err = cudaMalloc((void **)&d_src, src_mem_size);
      if (err == cudaSuccess)
          err = cudaMemcpy(d_src, src, src_mem_size, cudaMemcpyHostToDevice);
      if (err == cudaSuccess) {
          yv12torgb24_fourpixel<<<grid, block>>>(d_src, dst, src_width, src_height, dst_pitch);
          // A launch does not return a status; bad configurations are only
          // visible through cudaGetLastError.
          err = cudaGetLastError();
      }
      if (err == cudaSuccess) {
          // Wait for the kernel so that faults during execution are reported
          // here and d_src is no longer in use when it is freed.
          err = cudaDeviceSynchronize();
      }

      // cudaFree accepts a null pointer, so this is safe when cudaMalloc failed.
      const cudaError_t free_err = cudaFree(d_src);
      if (err == cudaSuccess)
          err = free_err;

      if (err != cudaSuccess) {
          fprintf(stderr, "yv12_to_rgb24: %s\n", cudaGetErrorString(err));
          return false;
      }
      return true;
  }

总结

经过cuda加速后的转换能够在1ms左右完成,还是比较理想的。

完!

2016年8月

cuda yv12_to_rgb24的更多相关文章

  1. CUDA[2] Hello,World

    Section 0:Hello,World 这次我们亲自尝试一下如何用粗(CU)大(DA)写程序 CUDA最新版本是7.5,然而即使是最新版本也不兼容VS2015 ...推荐使用VS2012 进入VS ...

  2. CUDA[1] Introductory

    Section 0 :Induction of CUDA CUDA是啥?CUDA®: A General-Purpose Parallel Computing Platform and Program ...

  3. Couldn't open CUDA library cublas64_80.dll etc. tensorflow-gpu on windows

    I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_load ...

  4. ubuntu 16.04 + N驱动安装 +CUDA+Qt5 + opencv

    Nvidia driver installation(after download XX.run installation file) 1. ctrl+Alt+F1   //go to virtual ...

  5. 手把手教你搭建深度学习平台——避坑安装theano+CUDA

    python有多混乱我就不多说了.这个混论不仅是指整个python市场混乱,更混乱的还有python的各种附加依赖包.为了一劳永逸解决python的各种依赖包对深度学习造成的影响,本文中采用pytho ...

  6. [CUDA] CUDA to DL

    又是一枚祖国的骚年,阅览做做笔记:http://www.cnblogs.com/neopenx/p/4643705.html 这里只是一些基础知识.帮助理解DL tool的实现. “这也是深度学习带来 ...

  7. 基于Ubuntu14.04系统的nvidia tesla K40驱动和cuda 7.5安装笔记

    基于Ubuntu14.04系统的nvidia tesla K40驱动和cuda 7.5安装笔记 飞翔的蜘蛛人 注1:本人新手,文章中不准确的地方,欢迎批评指正 注2:知识储备应达到Linux入门级水平 ...

  8. CUDA程序设计(一)

    为什么需要GPU 几年前我启动并主导了一个项目,当时还在谷歌,这个项目叫谷歌大脑.该项目利用谷歌的计算基础设施来构建神经网络. 规模大概比之前的神经网络扩大了一百倍,我们的方法是用约一千台电脑.这确实 ...

  9. 使用 CUDA范例精解通用GPU编程 配套程序的方法

    用vs新建一个cuda的项目,然后将系统自动生成的那个.cu里头的内容,除了头文件引用外,全部替代成先有代码的内容. 然后程序就能跑了. 因为新建的是cuda的项目,所以所有的头文件和库的引用系统都会 ...

随机推荐

  1. QC的使用学习(一)

    今天学习的时间很少,就利用睡前的一点时间来学习一下刚安装好的QC. 1.后台站点管理.主要是对八大选项的了解: site project:顾名思义,就站点项目管理,管理域和项目. site user: ...

  2. 命令行编译 WPF

    在开发调试代码 WPF 时,经常需要在修改完成代码后,点击 Rebuild,然后到指定文件夹下点击打开对应的 .exe 验证程序是否正确, 可以通过以下命名实现修改程序后,点击一个 .bat 文件,直 ...

  3. 问题 C: Goldbach's Conjecture

    题目描述 Goldbach's Conjecture: For any even number n greater than or equal to 4, there exists at least ...

  4. Linux编译安装opencv

    参考https://blog.csdn.net/huang826336127/article/details/78760885 一.下载opencv源码包 下载地址:https://opencv.or ...

  5. spring mvc:实现给Controller函数传入map参数

    [1]前端js调用示例: ...fillOrDiffer?inMapJson={"2016-08-31 0:00:00":0.1,"2016-08-31 0:15:00& ...

  6. BZOJ 3670 NOI2014 动物园 KMP+dp

    题目链接:http://www.lydsy.com/JudgeOnline/problem.php?id=3670 题意概述:令num[i]表示字符串由1~i的字符形成的前缀中不相重叠的相同前后缀的数 ...

  7. [译]在SQL查询中如何映射(替换)查询的结果?

    问题来源: https://stackoverflow.com/questions/38567366/mapping-values-in-sql-select 有一个表格,就称它Plant,它有三列: ...

  8. BST插入与查找

    B树: 二叉查找树,所有左节点都比父节点要小,所有右节点都比父节点要大.查找,插入的时间复杂度为O(logn) public class BTreeTest { public static int[] ...

  9. 【转】webpack4

    1.不再支持node.js4.X 2.不能用webpack命令直接打包指定的文件,只能使用webpack.config.js进行配置. 即:webpack  demo01.js  bundle01.j ...

  10. 简单dp总结

    ### 简单dp总结 本文是阅读<挑战程序设计第二版>其中关于dp章节所作总结.将简要描述dp的部分知识. 一.dp是什么? dp在计算机专业学科中全称是动态规划(dynamic prog ...