A tiny program that benchmarks two image transpose algorithms: a straightforward loop and a blocked (tiled) variant.
Here is the code:
- #include <stdio.h>
- #include <xmmintrin.h>
- #include <windows.h>
- typedef __m128 Vec;
- typedef unsigned long long value_t;
- __forceinline value_t now()
- {
- QueryPerformanceCounter(&n);
- return n.QuadPart;
- }
- inline void img_transpose(
- Vec *dst_img,
- Vec *src_img,
- const int src_w,
- const int src_h)
- {
- #pragma omp parallel for
- for (int j = ; j < src_w; ++j)
- {
- for (int i = ; i < src_h; ++i)
- {
- dst_img[j * src_h + i] = src_img[i * src_w + j];
- }
- }
- }
- inline void img_transpose_block(
- Vec *dst_img,
- Vec *src_img,
- const int src_w,
- const int src_h)
- {
- #pragma omp parallel for
- for (int j = ; j < src_w; j += )
- {
- for (int i = ; i < src_h; i += )
- {
- const int nsize = min(j + , src_w);
- const int msize = min(i + , src_h);
- for (int n = j; n < nsize; ++n)
- {
- for (int m = i; m < msize; ++m)
- {
- dst_img[n * src_h + m] = src_img[m * src_w + n];
- }
- }
- }
- }
- }
- int main(int argc, char *argv[])
- {
- //// performance benchmark ////
- const int w = ;
- const int h = ;
- Vec *a = new Vec [w * h];
- Vec *b = new Vec [w * h];
- value_t start_time, end_time;
- QueryPerformanceFrequency(&freq);
- double ms_per_tick = 1000.0 / (double)freq.QuadPart;
- start_time = now();
- for (int t = ; t < ; ++t)
- {
- img_transpose(b, a, w, h);
- img_transpose(a, b, h, w);
- }
- end_time = now();
- printf("img_transpose: %f ms\n", (double)(end_time - start_time) * ms_per_tick);
- start_time = now();
- for (int t = ; t < ; ++t)
- {
- img_transpose_block(b, a, w, h);
- img_transpose_block(a, b, h, w);
- }
- end_time = now();
- printf("img_transpose_block: %f ms\n", (double)(end_time - start_time) * ms_per_tick);
- delete [] a;
- delete [] b;
- //// algorithm validation ////
- const int width = ;
- const int height = ;
- Vec *src_img = new Vec [width * height];
- Vec *dst_img = new Vec [height * width];
- for (int j = ; j < height; ++j)
- {
- for (int i = ; i < width; ++i)
- {
- src_img[j * width + i].m128_i32[] = i;
- src_img[j * width + i].m128_i32[] = j;
- }
- }
- img_transpose_block(dst_img, src_img, width, height);
- for (int j = ; j < width; ++j)
- {
- for (int i = ; i < height; ++i)
- {
- int pi = dst_img[j * height + i].m128_i32[];
- int pj = dst_img[j * height + i].m128_i32[];
- if (pi != j || pj != i)
- {
- printf("Algorithm is wrong!!!\n");
- }
- }
- }
- printf("All done\n");
- return ;
- }
