0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc
介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。
▶ 源代码:静态
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]);
} __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]);
} __global__ void vote3(bool *info, int warp_size)// 跨线程束检查
{
int tx = threadIdx.x;
unsigned int mask = 0xffffffff;
bool *offs = info + (tx * );// 将每个线程指向等距间隔的元素,表明表决函数的运算结果可以进行分发 // 第一组 “下标模 3 得 0” 的元素为 0,第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素
*offs = __any_sync(mask, tx >= warp_size * / );
// 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0,第二组后半段和第三组 “下标模 3 得 1” 的元素为 1
*(offs + ) = (tx >= warp_size * / )? true: false;
// 第一组和第二组 “下标模 3 得 2” 的元素为 0,第三组 “下标模 3 得 2” 的元素为 1
*(offs + ) = all(tx >= warp_size * / ) ? true : false;
// 最终结果应该是:
// 1 2 3 4 15 16 17 18 30 31 32
// 000 000 000 000 ... 000 000 000 000 ... 000 000 000
// 100 100 100 100 ... 100 100 110 110 ... 110 110 110
// 111 111 111 111 ... 111 111 111 111 ... 111 111 111
}
#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleVote_kernel.cuh" #define WARP_SIZE 32
#define GROUP 4 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组,size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}
{
for (int i = ; i < size / ; i++)
VOTE_PATTERN[i] = 0x00000000; for (int i = size / ; i < size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? i : ; for (int i = size / ; i < * size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? : i; for (int i = * size / ; i < size; i++)
VOTE_PATTERN[i] = 0xffffffff;
}
// 数组检查函数,type == 1:把数组元素全部加起来,结果非零就报错;type == 0:把数组元素全部加起来,结果不等于 WARP_SIZE 就报错
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
int i, sum;
for (sum = , i = start; i < end; i++)
sum += h_result[i];
if (type&&sum > || !type&& sum != WARP_SIZE)
{
printf("\n\t<%s>[%d - %d]:", name, start, end-);
for (i = start; i < end; i++)
printf("%d,", h_result[i]);
printf("\b\n");
}
return type?(sum > ):(sum != WARP_SIZE);
} // 数组检查的中间函数,type == 1:使用(1,0,0,0)的模式调用数组检查函数;type == 0:使用(1,1,1,0)的模式调用数组检查函数
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
int error_count = ; error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any"); printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
}
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
int i, error_count;
for (i = error_count = ; i < totalThread * ; i++)
{
switch (i % )
{
case :
if (hinfo[i] != (i >= totalThread * )) // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * / )) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * )) // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)
error_count++;
break;
}
}
printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
} int main()
{
printf("\n\tStart.\n");
int totalElement;
unsigned int *h_input, *h_result;
unsigned int *d_input, *d_result;
bool *dinfo = NULL, *hinfo = NULL;
int error_count[] = { , , };
cudaSetDevice(); //使用长度为 4 个线程束的数组,刚好分为 4 个组(全零,后交替非零,前交替非零,全非零)进行表决
totalElement = WARP_SIZE * GROUP;
h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));
cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
genVoteTestPattern(h_input, totalElement);
cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice); //测试一,any
printf("\n\tTest 1: ");
voteAny << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);
cudaDeviceSynchronize();
cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试二,all
printf("\n\tTest 2: ");
voteAll << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);
cudaDeviceSynchronize();
cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试三,使用长度为 9 个线程束的数组,但调用内核时只使用数量为 3 个线程束的线程,即分为 3 组,每组 WARP_SIZE * 3 个元素
printf("\n\tTest 3: ");
totalElement = WARP_SIZE * * ;
hinfo = (bool *)calloc(totalElement, sizeof(bool));
cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
vote3 << <, totalElement / >> > (dinfo, WARP_SIZE);
cudaDeviceSynchronize();
cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
error_count[] = checkResultsVoteKernel(hinfo, totalElement / ); // 清理工作
cudaFree(d_input);
cudaFree(d_result);
free(h_input);
free(h_result);
free(hinfo);
cudaFree(dinfo);
printf("\t\nFinish.\n");
getchar();
return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start. Test : Passed Test : Passed Test : Passed Finish.
▶ 源代码:运行时编译(删掉了相同的注释)
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]);
} extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]);
} extern "C" __global__ void vote3(bool *info, int warp_size)
{
int tx = threadIdx.x;
unsigned int mask = 0xffffffff;
bool *offs = info + (tx * );
*offs = __any_sync(mask, tx >= warp_size * / );
*(offs + ) = (tx >= warp_size * / ) ? true : false;
*(offs + ) = all(tx >= warp_size * / ) ? true : false;
}
#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> #define WARP_SIZE 32
#define GROUP 4 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
for (int i = ; i < size / ; i++)
VOTE_PATTERN[i] = 0x00000000; for (int i = size / ; i < size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? i : ; for (int i = size / ; i < * size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? : i; for (int i = * size / ; i < size; i++)
VOTE_PATTERN[i] = 0xffffffff;
} int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
int i, sum;
for (sum = , i = start; i < end; i++)
sum += h_result[i];
if (type&&sum > || !type&& sum != WARP_SIZE)
{
printf("\n\t<%s>[%d - %d]:", name, start, end - );
for (i = start; i < end; i++)
printf("%d,", h_result[i]);
printf("\b\n");
}
return type ? (sum > ) : (sum != WARP_SIZE);
} int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
int error_count = ; error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any"); printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
}
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
int i, error_count;
for (i = error_count = ; i < totalThread * ; i++)
{
switch (i % )
{
case :
if (hinfo[i] != (i >= totalThread * ))
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * / ))
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * ))
error_count++;
break;
}
}
printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
} int main()
{
printf("\n\tStart.\n");
int totalElement;
unsigned int *h_input, *h_result;
CUdeviceptr d_input, d_result;// unsigned long long
bool *hinfo = NULL;
CUdeviceptr dinfo;
int error_count[] = { , , };
//cudaSetDevice(0); // 编译 PTX
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, );// (1, NULL) 为主函数接受的参数个数和参数
CUmodule module = loadPTX(ptx, , NULL); totalElement = WARP_SIZE * GROUP;
h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
genVoteTestPattern(h_input, totalElement);
cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int)); //测试一,any
printf("\n\tTest 1: ");
dim3 gridBlock(, );
dim3 threadBlock(totalElement, );
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, module, "voteAny");
void *arr1[] = { (void *)&d_input, (void *)&d_result };
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );
cuCtxSynchronize();
cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试二,all
printf("\n\tTest 2: ");
cuModuleGetFunction(&kernel_addr, module, "voteAll");
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );
cuCtxSynchronize();
cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试三
printf("\n\tTest 3: ");
totalElement = WARP_SIZE * * ;
hinfo = (bool *)calloc(totalElement, sizeof(bool));
cuMemAlloc(&dinfo, totalElement * sizeof(bool));
cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
threadBlock = dim3(totalElement / , ); // 改变线程块尺寸
cuModuleGetFunction(&kernel_addr, module, "vote3");
int size = WARP_SIZE;
void *arr2[] = { (void *)&dinfo, (void *)&size };
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr2[], );
cuCtxSynchronize();
cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
error_count[] = checkResultsVoteKernel(hinfo, totalElement / ); // 清理工作
cuMemFree(d_input);
cuMemFree(d_result);
free(h_input);
free(h_result);
free(hinfo);
cuMemFree(dinfo);
printf("\t\nFinish.\n");
getchar();
return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability Test : Passed Test : Passed Test : Passed Finish.
▶ 涨姿势
● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。
0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章
随机推荐
- 《DSP using MATLAB》Problem 4.26
Y(z)部分分式展开, 零状态响应和零输入响应的部分分式展开,
- WebApi_基于Token的身份验证——JWT
JWT是啥? JWT就是一个字符串,经过加密处理与校验处理的字符串,形式为: A.B.C A由JWT头部信息header加密得到B由JWT用到的身份验证信息json数据加密得到C由A和B加密得到,是校 ...
- POJ2777(线段树裸题)
题目:http://poj.org/problem?id=2777 别忘了各地的return: 有可能输入的L<R,手动swap: 似乎是多组输入? pushup和pushdown的位置. (原 ...
- JZ2440 裸机驱动 第9章 中断体系结构
本章目标: 了解ARM体系CPU的7种工作模式 了解S3C2410/S3C2440中断体系结构 掌握S3C2410/S3C2440的中断服务程序的编写方法 9.1 S3C241 ...
- Windows Phone Runtime Component 中的类型转换
Windows Phone Runtime Component 是Windows Phone 平台用来写C++类库的项目类型. 主要目的是让C#和C++进行互操作,引用历史的C++代码,保护知识产权, ...
- MVC 自定义 以表达式树为参数的htmlhelper
public static MvcHtmlString Try<TModel, TProperty>( this HtmlHelper<TModel> htmlHelper, ...
- java 关于Java中静态代码块以及构造函数的执行先后顺序
先转个链接 http://blog.csdn.net/wangbaokangfei/article/details/7757320 另外补充一下.静态方法,只执行一次就自动在内存中存在,再次实例化此 ...
- Android自动化测试中AccessibilityService获取控件信息(1)
Android自动化测试中AccessibilityService获取控件信息(1) 分类: android自动化测试2014-03-24 15:31 3455人阅读 评论(16) 收藏 举报 and ...
- mysql binlog协议分析--具体event
这几天在修改canal, 连接mysql和maria接收到的event有所区别 拿一个简单的insert sql来举例 mysql 会有以下几个event写入到binlog里 1.ANONYMOUS_ ...
- 设置redis 密码
redis配置密码 1.通过配置文件进行配置 yum方式安装的redis配置文件通常在/etc/redis.conf中,打开配置文件找到 [plain] view plain copy require ...