0_Simple__cdpSimplePrint + 0_Simple_

▶ CUDA 动态并行实现快排算法（单线程的递归调用）

▶ 源代码：动态并行递归调用线程块。要点：添加 -rdc=true 选项（生成 relocatable device code，相当于执行分离编译），以及链接库 cudadevrt.lib （用于动态并行，不同于运行时库 cudart.lib）

 #include <stdio.h>

 #include <cuda.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 #include <helper_string.h>

 __device__ int g_blockId = ;                                       // 线程块的全局编号，供所有线程读写

 __device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId)  // 打印当前线程块信息，包括深度，当前块号，

 {

     if (threadIdx.x == )

     {

         if (depth == )

             printf("BLOCK %d launched by the host\n", blockId);

         else

         {

             char buffer[];

             for (int i = ; i < depth; ++i)                         // 对应更多层级，每层前面都有相应层数的 "|  "

             {

                 buffer[ * i + ] = '+';

                 buffer[ * i + ] = ' ';

                 buffer[ * i + ] = ' ';

             }

             buffer[ * depth] = '\0';

             printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, blockId, parent_threadId, parent_blockId);

         }

     }

     __syncthreads();

 }

 __global__ void cdp_kernel(int max_depth, int depth, int parent_threadId, int parent_blockId)// 线程块递归

 {

     __shared__ int s_blockId;                                       // 当前线程块的编号

     if (threadIdx.x == )                                           // 读取当前 g_blockId 到 s_blockId 中，并将 g_blockId 加一

         s_blockId = atomicAdd(&g_blockId, );

     __syncthreads();

     print_info(depth, s_blockId, parent_threadId, parent_blockId);  // 打印当前线程块信息，

     if (++depth >= max_depth)                                       // 达到最大递归深度则退出，否则继续调用 cdp_kernel()

         return;

     cdp_kernel << <gridDim.x, blockDim.x >> >(max_depth, depth, threadIdx.x, s_blockId);

 }

 int main(int argc, char **argv)

 {

     printf("CUDA Dynamic Parallelism\n");

     int max_depth = ;

     int device_count = , device = -;

     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))// 帮助模式

     {

         printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[]);

         exit(EXIT_SUCCESS);

     }

     if (checkCmdLineFlag(argc, (const char **)argv, "depth"))       // 手动设置递归深度

     {

         max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

         if (max_depth <  || max_depth > )

         {

             printf("depth parameter has to be between 1 and 8\n");

             exit(EXIT_FAILURE);

         }

     }

     if (checkCmdLineFlag(argc, (const char **)argv, "device"))      // 命令行指定设备

     {

         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

         cudaDeviceProp properties;

         cudaGetDeviceProperties(&properties, device);

         if (properties.major >  || (properties.major ==  && properties.minor >= ))

             printf("Running on GPU %d (%s)\n", device, properties.name);

         else

         {

             printf("ERROR: required GPU with compute SM 3.5 or higher.\nCurrent GPU compute SM %d.%d\n", properties.major, properties.minor);

             exit(EXIT_FAILURE);

         }

     }

     else

     {

         cudaGetDeviceCount(&device_count);

         for (int i = ; i < device_count; ++i)

         {

             cudaDeviceProp properties;

             cudaGetDeviceProperties(&properties, i);

             if (properties.major >  || (properties.major ==  && properties.minor >= ))

             {

                 device = i;

                 printf("Running on GPU %d (%s)", i, properties.name);

                 break;

             }

             printf("Running on GPU %d (%s) does not support CUDA Dynamic Parallelism", i, properties.name);

         }

     }

     if (device == -)

     {

         printf("required GPU with compute SM 3.5 or higher.");

         exit(EXIT_WAIVED);

     }

     cudaSetDevice(device);

     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);

     printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");

     cdp_kernel << <,  >> >(max_depth, , , -);

     cudaGetLastError();

     cudaDeviceSynchronize();

     getchar();

     exit(EXIT_SUCCESS);

 }

● 输出结果：主机调用 2 个线程块，每个线程块 2 个线程，每个线程按同样规模递归调用，共 2*4 个二级核函数，2*4*4 个三级核函数，一共 42 个线程块

CUDA Dynamic Parallelism

Running on GPU  (GeForce GTX )Launching cdp_kernel() with CUDA Dynamic Parallelism:

BLOCK  launched by the host

BLOCK  launched by the host

+  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

+  +  BLOCK  launched by thread  of block

▶ 涨姿势：

● 在核函数中递归地调用核函数，注意函数调用的格式

▶ 源代码：动态并行实现快排算法，输出结果只有 Finish!

 #include <stdio.h>

 #include <cuda.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 #include <helper_string.h>

 #define MAX_DEPTH       16

 #define INSERTION_SORT  32

 __device__ void selection_sort(unsigned int *data, int left, int right) //选择排序，单线程完成

 {

     for (int i = left; i <= right; ++i)

     {

         unsigned min_val = data[i];

         int min_idx = i;

         for (int j = i + ; j <= right; ++j)                            // 找最小元素及其下标

         {

             unsigned val_j = data[j];

             if (val_j < min_val)

             {

                 min_idx = j;

                 min_val = val_j;

             }

         }

         if (i != min_idx)                                               // 交换第 i 号元素到指定的位置上

         {

             data[min_idx] = data[i];

             data[i] = min_val;

         }

     }

 }

 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)    // 快排主体，内含递归调用

 {

     if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)           // 递归深度达到 MAX_DEPTH 或者 数组中元素个数不多于 INSERTION_SORT 时使用选排

     {

         selection_sort(data, left, right);

         return;

     }

     unsigned int *lptr = data + left, *rptr = data + right, pivot = data[(left + right) / ];

     while (lptr <= rptr)

     {

         unsigned int lval = *lptr, rval = *rptr;                        // 指定左指针指向的值和右指针指向的值

         while (lval < pivot)                                            // 递增左指针，等价于 lptr++; lval = *lptr;

             lval = *(++lptr);

         while (rval > pivot)                                            // 递减右指针

             rval = *(--rptr);

         if (lptr <= rptr)                                               // 交换左右指针指向的值

         {

             *lptr++ = rval;

             *rptr-- = lval;

         }

     }

     if (left < rptr - data)                                             // 将左右分区放到两个不同的流中

     {

         cudaStream_t s0;

         cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);          // 指定该流不与 0 号流进行同步

         cdp_simple_quicksort << < , , , s0 >> >(data, left, rptr - data, depth + );

         cudaStreamDestroy(s0);

     }

     if (lptr - data < right)

     {

         cudaStream_t s1;

         cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);

         cdp_simple_quicksort << < , , , s1 >> >(data, lptr - data, right, depth + );

         cudaStreamDestroy(s1);

     }

 }

 void run_qsort(unsigned int *data, unsigned int n)                      // 快排入口

 {

     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH);        // 设置最大递归深度

     cdp_simple_quicksort << < ,  >> >(data, , n - , );

     cudaDeviceSynchronize();

 }

 int main(int argc, char **argv)

 {

     cudaSetDevice();

     const int n = ;

     unsigned int *h_data = (unsigned int *)malloc(sizeof(unsigned int) * n);

     srand();

     for (unsigned i = ; i < n; i++)

         h_data[i] = rand() % n;

     unsigned int *d_data;

     cudaMalloc((void **)&d_data, n * sizeof(unsigned int));

     cudaMemcpy(d_data, h_data, n * sizeof(unsigned int), cudaMemcpyHostToDevice);

     run_qsort(d_data, n);

     cudaMemcpy(h_data, d_data, n * sizeof(unsigned), cudaMemcpyDeviceToHost);

     for (int i = ; i < n; ++i)

     {

         if (h_data[i - ] > h_data[i])

         {

             printf("Error at i == %d, h_data[i-1] == %d, h_data[i] == %d\n", h_data[i - ], h_data[i]);

             break;

         }

     }

     printf("Finish!\n");

     free(h_data);

     cudaFree(d_data);

     getchar();

     exit(EXIT_SUCCESS);

 }

▶ 新姿势：

● checkCmdLineFlag 用于检验函数参数 argv 是否等于字符串 string_ref（定义于 helper_string.h 中）

 inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)

 {

     bool bFound = false;

     if (argc >= )

     {

         for (int i = ; i < argc; i++)

         {

             int string_start = stringRemoveDelimiter('-', argv[i]);

             const char *string_argv = &argv[i][string_start],

             const char*equal_pos = strchr(string_argv, '=');

             int argv_length = (int)(equal_pos ==  ? strlen(string_argv) : equal_pos - string_argv);

             int length = (int)strlen(string_ref);

             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))

             {

                 bFound = true;

                 continue;

             }

         }

     }

     return bFound;

 }

 inline int stringRemoveDelimiter(char delimiter, const char *string)    // 去除特定的符号，上述函数的中用于去除参数前面的 - 或 --

 {

     int string_start = ;

     while (string[string_start] == delimiter)

         string_start++;

     if (string_start >= (int)strlen(string) - )

         return ;

     return string_start;

 }

 #define STRNCASECMP _strnicmp                                           // 比较字符串（定义于string.h中）

 _ACRTIMP int __cdecl _strnicmp(_In_reads_or_z_(_MaxCount) char const* _String1, _In_reads_or_z_(_MaxCount) char const* _String2, _In_ size_t _MaxCount);

● getCmdLineArgumentInt 用于提取函数参数 argv 中的整数（定义于 helper_string.h 中）

 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)

 {

     bool bFound = false;

     int value = -;

     if (argc >= )

     {

         for (int i = ; i < argc; i++)

         {

             int string_start = stringRemoveDelimiter('-', argv[i]);

             const char *string_argv = &argv[i][string_start];

             int length = (int)strlen(string_ref);

             if (!STRNCASECMP(string_argv, string_ref, length))

             {

                 if (length +  <= (int)strlen(string_argv))

                 {

                     int auto_inc = (string_argv[length] == '=') ?  : ;

                     value = atoi(&string_argv[length + auto_inc]);

                 }

                 else

                     value = ;

                 bFound = true;

                 continue;

             }

         }

     }

     if (bFound)

         return value;

     return ;

 }

● 指定最大递归深度

extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);

● 带有标识符的 cudaStreamCreateWithFlags ，设置流的优先级

extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);

■ 对比 cudaStreamCreate

extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);

0_Simple__cdpSimplePrint + 0_Simple__cdpSimpleQuicksort的更多相关文章

随机推荐

java GUI编程一
一.AWT介绍所有的可以显示出来的图形元素都称为Component,Component代表了所有的可见的图形元素,Component里面有一种比较特殊的图形元素叫Container,Containe ...
.NET Excel导出方法及其常见问题详解
摘要:.NET Excel导出方法及其常见问题详解. 一.Excel导出的实现方法在.net 程序开发中,对于Excel文件的导出我们一共有三种导出方式: 利用文件输出流进行读写操作这种方式的导出 ...
JS之脚本延迟
自从开了博客,我就一下班回来匆匆吃完饭门一关等一开电脑一打开匆匆的研究东西,以至于朋友们都怀疑我是不是都得了自闭症其实因为我有恐惧心理怕自己的技术哪天跟不上社会了,说到技术我觉得技术不求越新越好,但 ...
jenkins~集群分发功能和职责处理
jenkins的多节点集群在进行自动化部署时,你可以按着它们的项目类型去进行分别部署,这样即可以达到负载均衡,又可以达到一种职责的明确,比如像java的项目你可以使用linux服务来进行部署(拉代码 ...
移植u-boot-2012.04.01到JZ2440
开发环境:Ubuntu 12.04 开发板:JZ2440 256M NandFlash 64M SDRAM 交叉编译器:arm-linux-gcc-4.3.2 u-boot:u-boot-2012 ...
简单说明CGI是什么
html { font-family: sans-serif } body { margin: 0 } article,aside,details,figcaption,figure,footer,h ...
GCD hdu1695容斥原理
GCD Time Limit: 6000/3000 MS (Java/Others) Memory Limit: 32768/32768 K (Java/Others)Total Submiss ...
html基础知识笔记
HTML基础 1.1HTML文件的基本结构和W3C标准 1.1.1HTML简介 HTML是一种描述网页的语言,一种超文本标记的语言! 1.1.2HTML文件的基本结构头部(head) 头部是网页的标 ...
对Java Web项目中路径的理解
第一个:文件分隔符坑比Window.window分隔符用\;unix采用/.于是用File.separator来跨平台请注意:这是文件路径.在File f = new File(“c:\\hah ...
js 防止变量冲突
解决方法 sc1 和sc2 的a都是window.a 会指向一个变量 1:使用匿名函数将脚本包起来,全局变量用来通信如window.x; 2:命名空间代码: <script> (fun ...

0_Simple__cdpSimplePrint + 0_Simple__cdpSimpleQuicksort

0_Simple__cdpSimplePrint + 0_Simple__cdpSimpleQuicksort的更多相关文章

随机推荐

热门专题