0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics

介绍了线程束表决函数的实例（其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html），并在静态和运行时编译两种条件下进行使用。

▶ 源代码：静态

 // simpleVote_kernel.cuh

 #ifndef SIMPLEVOTE_KERNEL_CU

 #define SIMPLEVOTE_KERNEL_CU

 __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __any_sync(mask, input[tx]);

 }

 __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __all_sync(mask, input[tx]);

 }

 __global__ void vote3(bool *info, int warp_size)// 跨线程束检查

 {

     int tx = threadIdx.x;

     unsigned int mask = 0xffffffff;

     bool *offs = info + (tx * );// 将每个线程指向等距间隔的元素，表明表决函数的运算结果可以进行分发

     // 第一组 “下标模 3 得 0” 的元素为 0，第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素

     *offs = __any_sync(mask, tx >= warp_size *  / );

     // 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0，第二组后半段和第三组 “下标模 3 得 1” 的元素为 1

     *(offs + ) = (tx >= warp_size *  / )? true: false;

     // 第一组和第二组 “下标模 3 得 2” 的元素为 0，第三组 “下标模 3 得 2” 的元素为 1

     *(offs + ) = all(tx >= warp_size *  / ) ? true : false;

     // 最终结果应该是：

     //   1   2   3   4      15  16  17  18      30  31  32

     // 000 000 000 000 ... 000 000 000 000 ... 000 000 000

     // 100 100 100 100 ... 100 100 110 110 ... 110 110 110

     // 111 111 111 111 ... 111 111 111 111 ... 111 111 111

 }

 #endif

 // simpleVoteIntrinsics.cu

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_functions.h>

 #include <helper_cuda.h>

 #include "simpleVote_kernel.cuh"

 #define WARP_SIZE   32

 #define GROUP       4

 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组，size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}

 {

     for (int i = ; i < size / ; i++)

         VOTE_PATTERN[i] = 0x00000000; 

     for (int i = size / ; i < size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ? i : ; 

     for (int i = size / ; i <  * size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ?  : i; 

     for (int i =  * size / ; i < size; i++)

         VOTE_PATTERN[i] = 0xffffffff;

 }

 // 数组检查函数，type == 1：把数组元素全部加起来，结果非零就报错；type == 0：把数组元素全部加起来，结果不等于 WARP_SIZE 就报错

 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)

 {

     int i, sum;

     for (sum = , i = start; i < end; i++)

         sum += h_result[i];

     if (type&&sum >  || !type&& sum != WARP_SIZE)

     {

         printf("\n\t<%s>[%d - %d]:", name, start, end-);

         for (i = start; i < end; i++)

             printf("%d,", h_result[i]);

         printf("\b\n");

     }

     return type?(sum > ):(sum != WARP_SIZE);

 }

 // 数组检查的中间函数，type == 1：使用(1,0,0,0)的模式调用数组检查函数；type == 0：使用(1,1,1,0)的模式调用数组检查函数

 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)

 {

     int error_count = ;

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int checkResultsVoteKernel(bool *hinfo, int totalThread)

 {

     int i, error_count;

     for (i = error_count = ; i < totalThread * ; i++)

     {

         switch (i % )

         {

             case :

                 if (hinfo[i] != (i >= totalThread * ))     // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)

                     error_count++;

                 break;

             case :

                 if (hinfo[i] != (i >= totalThread *  / )) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)

                     error_count++;

                 break;

             case :

                 if (hinfo[i] != (i >= totalThread * ))     // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)

                     error_count++;

                 break;

         }

     }

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int main()

 {

     printf("\n\tStart.\n");

     int totalElement;

     unsigned int *h_input, *h_result;

     unsigned int *d_input, *d_result;

     bool *dinfo = NULL, *hinfo = NULL;

     int error_count[] = { , ,  };

     cudaSetDevice();

     //使用长度为 4 个线程束的数组，刚好分为 4 个组（全零，后交替非零，前交替非零，全非零）进行表决

     totalElement = WARP_SIZE * GROUP;

     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));

     cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));

     genVoteTestPattern(h_input, totalElement);

     cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice);

     //测试一，any

     printf("\n\tTest 1: ");

     voteAny << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);

     cudaDeviceSynchronize();

     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试二，all

     printf("\n\tTest 2: ");

     voteAll << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);

     cudaDeviceSynchronize();

     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试三，使用长度为 9 个线程束的数组，但调用内核时只使用数量为 3 个线程束的线程，即分为 3 组，每组 WARP_SIZE * 3 个元素

     printf("\n\tTest 3: ");

     totalElement = WARP_SIZE *  * ;

     hinfo = (bool *)calloc(totalElement, sizeof(bool));

     cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));

     cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);

     vote3 << <, totalElement /  >> > (dinfo, WARP_SIZE);

     cudaDeviceSynchronize();

     cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);

     error_count[] = checkResultsVoteKernel(hinfo, totalElement / );

     // 清理工作

     cudaFree(d_input);

     cudaFree(d_result);

     free(h_input);

     free(h_result);

     free(hinfo);

     cudaFree(dinfo);

     printf("\t\nFinish.\n");

     getchar();

     return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;

 }

▶ 输出结果：

    Start.

    Test : Passed

    Test : Passed

    Test : Passed

    Finish.

▶ 源代码：运行时编译（删掉了相同的注释）

 // simpleVote_kernel.cuh

 #ifndef SIMPLEVOTE_KERNEL_CU

 #define SIMPLEVOTE_KERNEL_CU

 extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __any_sync(mask, input[tx]);

 }

 extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __all_sync(mask, input[tx]);

 }

 extern "C" __global__ void vote3(bool *info, int warp_size)

 {

     int tx = threadIdx.x;

     unsigned int mask = 0xffffffff;

     bool *offs = info + (tx * );

     *offs = __any_sync(mask, tx >= warp_size *  / );

     *(offs + ) = (tx >= warp_size *  / ) ? true : false;

     *(offs + ) = all(tx >= warp_size *  / ) ? true : false;

 }

 #endif

 // simpleVoteIntrinsics.cu

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include "nvrtc_helper.h"

 #include <helper_functions.h>

 #define WARP_SIZE   32

 #define GROUP       4

 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)

 {

     for (int i = ; i < size / ; i++)

         VOTE_PATTERN[i] = 0x00000000;

     for (int i = size / ; i < size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ? i : ;

     for (int i = size / ; i <  * size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ?  : i;

     for (int i =  * size / ; i < size; i++)

         VOTE_PATTERN[i] = 0xffffffff;

 }

 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)

 {

     int i, sum;

     for (sum = , i = start; i < end; i++)

         sum += h_result[i];

     if (type&&sum >  || !type&& sum != WARP_SIZE)

     {

         printf("\n\t<%s>[%d - %d]:", name, start, end - );

         for (i = start; i < end; i++)

             printf("%d,", h_result[i]);

         printf("\b\n");

     }

     return type ? (sum > ) : (sum != WARP_SIZE);

 }

 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)

 {

     int error_count = ;

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int checkResultsVoteKernel(bool *hinfo, int totalThread)

 {

     int i, error_count;

     for (i = error_count = ; i < totalThread * ; i++)

     {

         switch (i % )

         {

         case :

             if (hinfo[i] != (i >= totalThread * ))

                 error_count++;

             break;

         case :

             if (hinfo[i] != (i >= totalThread *  / ))

                 error_count++;

             break;

         case :

             if (hinfo[i] != (i >= totalThread * ))

                 error_count++;

             break;

         }

     }

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int main()

 {

     printf("\n\tStart.\n");

     int totalElement;

     unsigned int *h_input, *h_result;

     CUdeviceptr d_input, d_result;// unsigned long long

     bool *hinfo = NULL;

     CUdeviceptr dinfo;

     int error_count[] = { , ,  };

     //cudaSetDevice(0); 

     // 编译 PTX

     char *ptx, *kernel_file;

     size_t ptxSize;

     kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";

     compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, );// (1, NULL) 为主函数接受的参数个数和参数

     CUmodule module = loadPTX(ptx, , NULL);

     totalElement = WARP_SIZE * GROUP;

     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));

     cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));

     genVoteTestPattern(h_input, totalElement);

     cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int));

     //测试一，any

     printf("\n\tTest 1: ");

     dim3 gridBlock(, );

     dim3 threadBlock(totalElement, );

     CUfunction kernel_addr;

     cuModuleGetFunction(&kernel_addr, module, "voteAny");

     void *arr1[] = { (void *)&d_input, (void *)&d_result };

     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );

     cuCtxSynchronize();

     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试二，all

     printf("\n\tTest 2: ");

     cuModuleGetFunction(&kernel_addr, module, "voteAll");

     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );

     cuCtxSynchronize();

     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试三

     printf("\n\tTest 3: ");

     totalElement = WARP_SIZE *  * ;

     hinfo = (bool *)calloc(totalElement, sizeof(bool));

     cuMemAlloc(&dinfo, totalElement * sizeof(bool));

     cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));

     threadBlock = dim3(totalElement / , );                    // 改变线程块尺寸

     cuModuleGetFunction(&kernel_addr, module, "vote3");

     int size = WARP_SIZE;

     void *arr2[] = { (void *)&dinfo, (void *)&size };

     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr2[], );

     cuCtxSynchronize();

     cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));

     error_count[] = checkResultsVoteKernel(hinfo, totalElement / );

     // 清理工作

     cuMemFree(d_input);

     cuMemFree(d_result);

     free(h_input);

     free(h_result);

     free(hinfo);

     cuMemFree(dinfo);

     printf("\t\nFinish.\n");

     getchar();

     return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;

 }

▶ 输出结果：

        Start.

> Using CUDA Device []: GeForce GTX

> GPU Device has SM 6.1 compute capability

        Test : Passed

        Test : Passed

        Test : Passed

Finish.

▶ 涨姿势

● 线程表决函数见另一篇博客，注意 CUDA9.0 改进了部分函数，废弃了旧的部分函数。

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章

随机推荐

迭代器Iterator的底层实现原理
第一步:没有接口的迭代器简单实现原理 package com.bjsxt.xiaofei; /** * 迭代器底层原理 * 方法: * hasNext() * next() * remove() * ...
部署tomcat到Linux
1. alt+p 放文件 2.解压到自定义 apps文件夹中 tar -zxvf apache-tomcat-7.0.68.tar.gz -C apps 3.进入文件启动tomcat/bin ./ ...
wpf的datepicker处理（转）
如果有2个datepicker,控制时间起和止的话,可以把第二个datepicker加一个属性,DisplayDateStart = "{Binding SelectedDate,Eleme ...
看懂Class文件的装载流程
Class文件的加载过程 ClassLoader的工作模式类的热加载 1 Class文件的装载流程只有被java虚拟机装载的Class类型才能在程序中使用(注意装载和加载的区别) 1.1 类装载的 ...
FastAdmin 离线安装 ueditor 出现 rule 错误
使用的是 phpStudy 的 nginx + php5.6 离线安装 ueditor.zip 出现,安装其它的插件没有问题. Call to a member function rule() on ...
【转】每天一个linux命令（60）：scp命令
原文网址:http://www.cnblogs.com/peida/archive/2013/03/15/2960802.html scp是secure copy的简写,用于在Linux下进行远程拷贝 ...
MySQL中UNSIGNED和ZEROFILL的介绍
UNSIGNED: 将数字类型无符号化,这与C和C++这些程序语言的unsigned含义相同. INT的类型范围-2 147 483 648~2 147 483 647 INT UNSIGNED范围0 ...
Linux C程序操作Mysql 调用PHP采集淘宝商品 (转)
还是继续这个项目. 在上一篇Linux下利用Shell使PHP并发采集淘宝产品中,采用shell将对PHP的调用推到后台执行,模拟多线程. 此方法有一致命缺点,只能人工预判每个程序执行时间.如果判断时 ...
opencv中的缩放函数
图像处理里面缩放操作是比较常见的: 最近邻插值:类似简单映射的处理方式,目标图像w1,h1,原始图像w0,h0,则在目标图像上的点(x,y)的像素点实际对应原始图上(x*w0/w1,y*h0/h1)的 ...
CMSIS DSP Lib：RFFT函数的bug
调用arm_rfft_fast_init_f32之后,还有两个成员需要初始化: arm_rfft_fast_instance_f32::pTwiddleRFFT = twiddleCoef_rfft_ ...

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章

随机推荐

热门专题