// Fragment of a device-side routine (its signature lies outside this view).
// Each thread stores its y0y1, y2y3 and uv values into block-shared tiles;
// after a barrier, the thread selected by the threadIdx test below copies
// the ys0 and ys1 tiles out through pDst, one uchar2 at a time.
//
// NOTE(review): numeric literals appear to be missing throughout this
// fragment (array extents, comparison operands, loop start values, stride
// multipliers). As shown it is not valid CUDA -- restore the constants from
// the original file before building.
int bw = blockDim.x;
// NOTE(review): threadIdx.x is always less than blockDim.x (likewise .y),
// so the "% bw" / "% bh" below leave the index unchanged.
int bh = blockDim.y; int tx = threadIdx.x%bw;
// Block-shared staging tiles, indexed [ty][tx]. The extents are absent here;
// presumably they must cover at least blockDim.y x blockDim.x -- TODO confirm.
int ty = threadIdx.y%bh; __shared__ uchar2 ys0[][];
__shared__ uchar2 ys1[][];
// Each thread deposits its own values into its own tile slot.
__shared__ uchar2 uvs[][]; ys0[ty][tx] = y0y1;
ys1[ty][tx] = y2y3;
// Barrier: every thread's tile writes must complete before the tiles are
// read back by the write-back loop below.
uvs[ty][tx] = uv; __syncthreads();
// Only the thread whose threadIdx.x / threadIdx.y match the (missing)
// operands performs the write-back.
// NOTE(review): this makes all bh * bw stores run serially in one thread.
if (threadIdx.x == && threadIdx.y == ) {
// Walk the tile rows; the loop ends when j reaches bh.
for (int j = ; j != bh; ++j) {
// Row pointers for the two luma outputs, derived from pDst, nPitch, iy and
// ix. The offset units depend on pDst's element type, which is not visible
// here -- verify against the declaration.
uchar2* py0 = (uchar2*)(pDst + (iy + j) * * nPitch + ix * );
uchar2* py1 = (uchar2*)(pDst + ((iy + j) * +) * nPitch + ix * );
// NOTE(review): puv is computed (using nWidth rather than nPitch) but never
// dereferenced in the visible code, because the chroma store below is
// commented out.
uchar2* puv = (uchar2*)(pDstUv + (iy + j)*nWidth + ix * );
// Walk the tile columns, copying one uchar2 per destination row pointer.
for (int i = ; i != bw; ++i) {
*py0++ = ys0[j][i];
*py1++ = ys1[j][i];
// Chroma write-back is disabled; uvs is filled above but not read here.
//*puv++ = uvs[j][i];
// The two braces below close the inner and outer for loops; the closing
// brace of the if is not within this view.
} }


