▶ Code from the book: optimizing a Julia set renderer step by step

● Baseline: no parallelism (variables etc. hand-tuned)

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

#define N (1024 * 8)

// compute the iteration count for a single point
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2) // two z-updates per loop pass
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2); // non-escaping points map above maxIter (digits were lost in extraction; reconstructed)
}

int main()
{
    const int maxIter = 1000;                                // maximum iteration count (original value lost in extraction; 1000 is a placeholder)
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N; // Julia constant and pixel step
    int *image = (int *)malloc(sizeof(int) * N * N);
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter));
        fprintf(pf, "\n");
    }
    fclose(pf);
    free(image);
    //getchar();
    return 0;
}

● Output (every later version produces the same image, so it is not shown again)
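To actually look at the image, the text dump can be converted to a grayscale PGM. The viewer below is my sketch, not the book's code; it assumes the placeholder maxIter = 1000 used above and the same output path.

// viewer.c -- sketch: turn the iteration-count dump into a grayscale PGM (not from the book)
#include <stdio.h>

#define N (1024 * 8)

int main(void)
{
    const int maxIter = 1000;               // must match the renderer (placeholder value)
    FILE *in = fopen("R:/output.txt", "r");
    FILE *out = fopen("R:/output.pgm", "w");
    if (!in || !out)
        return 1;
    fprintf(out, "P2\n%d %d\n255\n", N, N); // plain-text PGM header
    for (long k = 0; k < (long)N * N; k++)
    {
        int v;
        if (fscanf(in, "%d", &v) != 1)
            return 1;
        int g = (int)((long)v * 255 / maxIter); // scale iteration count to 0..255
        fprintf(out, "%d\n", g > 255 ? 255 : g);
    }
    fclose(in);
    fclose(out);
    return 0;
}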

● Improvement 1: parallelize the computation

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

#define N (1024 * 8)

#pragma acc routine seq
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2)
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2);
}

int main()
{
    const int maxIter = 1000; // placeholder value; see the baseline version
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N;
    int *image = (int *)malloc(sizeof(int) * N * N);
#pragma acc data copyout(image[0:N * N]) // data region
    {
#pragma acc kernels loop independent // parallelize the loop, forcing iterations to be treated as independent
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
                image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
        }
    }
    /* file output commented out so that Nvvp does not pull it into the profile
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", image[i * N + j]);
        fprintf(pf, "\n");
    }
    fclose(pf);
    */
    free(image);
    //getchar();
    return 0;
}

● Output

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
    Generating acc routine seq
    Generating Tesla code
    FMA (fused multiply-add) instruction(s) generated
    FMA (fused multiply-add) instruction(s) generated
main:
    Generating copyout(image[:])
    Loop is parallelizable
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
    #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=
    time(us):
    data region reached  times
        data copyout transfers:
            device time(us): total= max= min= avg=
    compute region reached  time
        kernel launched  time
            grid: [256x128]  block: [32x4]
            device time(us): total= max= min= avg=
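The compiler picked the 32x4 thread block (grid=256x128) on its own. The shape can be steered from the loop nest if one wants to experiment; a sketch with illustrative, untuned widths:

#pragma acc kernels loop independent gang vector(8) // outer loop -> blockIdx.y / threadIdx.y
for (int i = 0; i < N; i++)
{
#pragma acc loop independent gang vector(32)        // inner loop -> blockIdx.x / threadIdx.x
    for (int j = 0; j < N; j++)
        image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
}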

● Improvement 2: compute in blocks. No noticeable speedup yet; groundwork for asynchronous execution

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

#define N (1024 * 8)

#pragma acc routine seq
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2)
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2);
}

int main()
{
    const int maxIter = 1000; // placeholder value; see the baseline version
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N;
    int *image = (int *)malloc(sizeof(int) * N * N);
#pragma acc data copyout(image[0:N * N])
    {
        const int numblock = 4; // number of blocks (4 matches the four kernel launches in the log below)
        for (int block = 0; block < numblock; block++) // compute one block per pass
        {
            const int start = block * (N / numblock), end = start + N / numblock; // first and one-past-last row of this block
#pragma acc kernels loop independent
            for (int i = start; i < end; i++)
            {
                for (int j = 0; j < N; j++)
                    image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
            }
        }
    }
    /*
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", image[i * N + j]);
        fprintf(pf, "\n");
    }
    fclose(pf);
    */
    free(image);
    //getchar();
    return 0;
}

● Output

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
    Generating acc routine seq
    Generating Tesla code
    FMA (fused multiply-add) instruction(s) generated
    FMA (fused multiply-add) instruction(s) generated
main:
    Generating copyout(image[:])
    Loop is parallelizable
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
    #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=
    time(us):
    data region reached  times
        data copyout transfers:
            device time(us): total= max= min= avg=
    compute region reached  times
        kernel launched  times
            grid: [256x128]  block: [32x4]
            device time(us): total= max= min= avg=
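To check the "no noticeable speedup" claim numerically it helps to wrap the data region in a wall-clock timer on the host. A sketch using omp_get_wtime (requires compiling with -mp as well; the placement is my assumption, not the book's code):

#include <omp.h> // for omp_get_wtime()

double t0 = omp_get_wtime();
#pragma acc data copyout(image[0:N * N])
{
    /* ... blocked kernels loops as above ... */
}
double t1 = omp_get_wtime(); // the data region only ends after all blocks have finished
printf("compute + transfer: %.3f s\n", t1 - t0);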

● Improvement 3: transfer in blocks. No noticeable speedup yet; groundwork for asynchronous execution

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

#define N (1024 * 8)

#pragma acc routine seq
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2)
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2);
}

int main()
{
    const int maxIter = 1000; // placeholder value; see the baseline version
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N;
    int *image = (int *)malloc(sizeof(int) * N * N);
#pragma acc data create(image[0:N * N]) // changed to create: no initial copy from the host is needed
    {
        const int numBlock = 4, blockSize = N * N / numBlock; // still computing in blocks
        for (int block = 0; block < numBlock; block++)
        {
            const int start = block * (N / numBlock), end = start + N / numBlock;
#pragma acc kernels loop independent
            for (int i = start; i < end; i++)
            {
                for (int j = 0; j < N; j++)
                    image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
            }
#pragma acc update host(image[block * blockSize : blockSize]) // copy each block back to the host as soon as it is computed
        }
    }
    /*
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", image[i * N + j]);
        fprintf(pf, "\n");
    }
    fclose(pf);
    */
    free(image);
    //getchar();
    return 0;
}

● Output

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
    Generating acc routine seq
    Generating Tesla code
    FMA (fused multiply-add) instruction(s) generated
    FMA (fused multiply-add) instruction(s) generated
main:
    Generating create(image[:])
    Loop is parallelizable
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
    #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    FMA (fused multiply-add) instruction(s) generated
    Generating update self(image[block*blockSize:blockSize])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=
    time(us):
    data region reached  times
    compute region reached  times
        kernel launched  times
            grid: [256x128]  block: [32x4]
            elapsed time(us): total= max= min= avg=
    update directive reached  times
        data copyout transfers:
            device time(us): total= max= min= avg=
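Note that pgcc echoes `update self(...)` for the `update host(...)` directive: `self` is the newer OpenACC spelling and the two clauses mean the same thing here, a device-to-host copy of the listed slice. A two-line reminder of the directions (off and len are hypothetical placeholders):

#pragma acc update host(image[off:len])   // device -> host copy (pgcc reports this as "update self")
#pragma acc update device(image[off:len]) // host -> device copy, the opposite direction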

● Improvement 4: asynchronous execution, overlapping computation with transfers

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

#define N (1024 * 8)

#pragma acc routine seq
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2)
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2);
}

int main()
{
    const int maxIter = 1000; // placeholder value; see the baseline version
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N;
    int *image = (int *)malloc(sizeof(int) * N * N);
#pragma acc data create(image[0:N * N])
    {
        const int numBlock = 4, blockSize = N / numBlock * N;
        for (int block = 0; block < numBlock; block++)
        {
            const int start = block * (N / numBlock), end = start + N / numBlock;
#pragma acc kernels loop independent async(block + 1) // asynchronous compute, tagged with the block index
            for (int i = start; i < end; i++)
            {
                for (int j = 0; j < N; j++)
                    image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
            }
#pragma acc update host(image[block * blockSize : blockSize]) async(block + 1) // transfer each block asynchronously once computed
        }
#pragma acc wait // drain all queues before the data region ends
    }
    /*
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", image[i * N + j]);
        fprintf(pf, "\n");
    }
    fclose(pf);
    */
    free(image);
    //getchar();
    return 0;
}

● Output

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
julia:
    Generating acc routine seq
    Generating Tesla code
    FMA (fused multiply-add) instruction(s) generated
    FMA (fused multiply-add) instruction(s) generated
main:
    Generating create(image[:])
    Loop is parallelizable
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
    #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    FMA (fused multiply-add) instruction(s) generated
    Generating update self(image[block*blockSize:blockSize])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= queue= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= queue= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= queue= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= queue= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
Timing may be affected by asynchronous behavior
set PGI_ACC_SYNCHRONOUS to 1 to disable async() clauses
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=
    time(us):
    data region reached  times
    compute region reached  times
        kernel launched  times
            grid: [256x128]  block: [32x4]
            device time(us): total= max= min= avg=
    update directive reached  times
        data copyout transfers:
            device time(us): total= max= min= avg=
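Each async(block + 1) tag is a separate queue (a CUDA stream; the launch lines now carry a queue= field). The kernel and the update of one block share a tag, so they stay ordered relative to each other, while different blocks may overlap. The host can also wait on a single queue instead of all of them; a hypothetical usage sketch (process_block is not the book's function):

#pragma acc wait(1)      // only block 0's queue: its kernel and its update are complete
process_block(image, 0); // hypothetical host-side consumer of the first block
#pragma acc wait         // drain the remaining queues before leaving the data region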

● Using unified memory addressing (Ubuntu; not supported on 64-bit Windows)
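I have not reproduced this here, but the usual recipe on Linux is PGI's managed-memory target, which backs malloc with CUDA unified memory so the explicit data and update directives can simply be dropped. A sketch under that assumption:

// build (Linux): pgcc -acc -ta=tesla:managed -Minfo main.c -o main_acc
int *image = (int *)malloc(sizeof(int) * N * N); // allocation becomes managed memory
#pragma acc kernels loop independent             // no data region: pages migrate on demand
for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
        image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);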

● Improvement 5: multi-device version 1, using OpenMP

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <openacc.h>

#define N (1024 * 8)

#pragma acc routine seq
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2)
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2);
}

int main()
{
    const int maxIter = 1000; // placeholder value; see the baseline version
    const int numBlock = acc_get_num_devices(acc_device_nvidia), blockSize = N / numBlock * N; // query the number of NVIDIA devices and use it as the block count
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N;
    int *image = (int *)malloc(sizeof(int) * N * N);
    acc_init(acc_device_nvidia); // initialize all target devices at once

#pragma omp parallel num_threads(numBlock) // one host thread per device, each dispatching work to its own GPU
    {
        acc_set_device_num(omp_get_thread_num(), acc_device_nvidia); // bind this thread to one device
#pragma omp for
        for (int block = 0; block < numBlock; block++)
        {
            const int start = block * (N / numBlock), end = start + N / numBlock;
#pragma acc data copyout(image[block * blockSize : blockSize])
            {
#pragma acc kernels loop independent
                for (int i = start; i < end; i++)
                {
                    for (int j = 0; j < N; j++)
                        image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
                }
            }
        }
    }
    /*
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", image[i * N + j]);
        fprintf(pf, "\n");
    }
    fclose(pf);
    */
    free(image);
    //getchar();
    return 0;
}

● Output

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -mp -Minfo main.c -o main_acc.exe
julia:
    Generating acc routine seq
    Generating Tesla code
    FMA (fused multiply-add) instruction(s) generated
    FMA (fused multiply-add) instruction(s) generated
main:
    Parallel region activated
    Parallel loop activated with static block schedule
    Generating copyout(image[block*blockSize:blockSize])
    Loop is parallelizable
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
    #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    FMA (fused multiply-add) instruction(s) generated
    Barrier
    Parallel region terminated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=
    time(us):
    data region reached  times
        data copyout transfers:
            device time(us): total= max= min= avg=
    compute region reached  time
        kernel launched  time
            grid: [256x128]  block: [32x4]
            device time(us): total= max= min= avg=
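Whether each OpenMP thread really got its own GPU can be checked from inside the parallel region with the standard OpenACC query calls; a small diagnostic sketch (my addition, not the book's):

#pragma omp parallel num_threads(numBlock)
{
    acc_set_device_num(omp_get_thread_num(), acc_device_nvidia);
    printf("thread %d -> device %d of %d\n", omp_get_thread_num(),
           acc_get_device_num(acc_device_nvidia),
           acc_get_num_devices(acc_device_nvidia));
}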

● Improvement 6: multi-device version 2, adjusting the OpenMP setup

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <openacc.h>

#define N (1024 * 8)

#pragma acc routine seq
int julia(const float cre, const float cim, float zre, float zim, const int maxIter)
{
    float zre2 = 0.0f, zim2 = 0.0f;
    for (int iter = 0; iter < maxIter; iter += 2)
    {
        zre2 = zre * zre - zim * zim + cre, zim2 = 2 * zre * zim + cim;
        if (zre2 * zre2 + zim2 * zim2 > 4.0f)
            return iter;
        zre = zre2 * zre2 - zim2 * zim2 + cre, zim = 2 * zre2 * zim2 + cim;
        if (zre * zre + zim * zim > 4.0f)
            return iter;
    }
    return maxIter + 1 + (maxIter % 2);
}

int main()
{
    const int maxIter = 1000; // placeholder value; see the baseline version
    const int numBlock = acc_get_num_devices(acc_device_nvidia), blockSize = N / numBlock * N;
    const float cre = -0.8350, cim = -0.2321, h = 4.0f / N;
    int *image = (int *)malloc(sizeof(int) * N * N);
    acc_init(acc_device_nvidia);

#pragma omp parallel for num_threads(numBlock) // call acc_set_device_num in its own parallel loop
    for (int block = 0; block < numBlock; block++)
        acc_set_device_num(block, acc_device_nvidia);

#pragma omp parallel for num_threads(numBlock) // must itself be a parallel region (a bare "omp for" is not valid here)
    for (int block = 0; block < numBlock; block++)
    {
        const int start = block * (N / numBlock), end = start + N / numBlock;
#pragma acc data copyout(image[block * blockSize : blockSize])
        {
#pragma acc kernels loop independent
            for (int i = start; i < end; i++)
            {
                for (int j = 0; j < N; j++)
                    image[i * N + j] = julia(cre, cim, i * h - 2.0f, j * h - 2.0f, maxIter);
            }
        }
    }
    /*
    FILE *pf = fopen("R:/output.txt", "w");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            fprintf(pf, "%d ", image[i * N + j]);
        fprintf(pf, "\n");
    }
    fclose(pf);
    */
    free(image);
    //getchar();
    return 0;
}

● Output

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -acc -Minfo -o main_acc.exe
julia:
    Generating acc routine seq
    Generating Tesla code
    FMA (fused multiply-add) instruction(s) generated
    FMA (fused multiply-add) instruction(s) generated
main:
    Generating copyout(image[block*blockSize:blockSize])
    Loop is parallelizable
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
    #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line= device= threadid= num_gangs= num_workers= vector_length= grid=256x128 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=
    time(us):
    data region reached  times
        data copyout transfers:
            device time(us): total= max= min= avg=
    compute region reached  time
        kernel launched  time
            grid: [256x128]  block: [32x4]
            elapsed time(us): total= max= min= avg=
