▶ 协作组,CUDA9.0 的新特性

▶ 源代码,如何获得协作组的编号?

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cooperative_groups.h> #define THREAD_PER_BLOCK 64 using namespace cooperative_groups; // 注意使用命名空间 __device__ int sumReduction(thread_group g, int *x, int val) // 规约设备函数,要求共享内存 int *x 要够放得下 g.size() 个参加规约的元素
{
int lane = g.thread_rank(); // 线程在协作组中的编号,教程中名字就叫 line ID for (int i = g.size() / ; i > ; i /= )
{
x[lane] = val; // 第一次迭代该步相当于初始化,以后迭代该步相当于存储上一次迭代的结果
g.sync(); // 协作组同步
if (lane < i)
val += x[lane + i]; // 利用每个线程局部变量 val 记录当前结果
g.sync();
}
if (g.thread_rank() == ) // 零号线程返回计算结果
return val;
else
return -;
} __global__ void cgkernel()
{
extern __shared__ int workspace[]; thread_block group = this_thread_block(); // 将线程块内所有线程打包为一个协作组
int groupSize = group.size(); // 获得协作组大小(线程个数)
int input = group.thread_rank(); // 获得线程在协作组内的编号,并作为计算输入
int output = sumReduction(group, workspace, input); // 规约计算,注意直接使用共享内存作为工作空间
int expectedOutput = (groupSize - )*groupSize / ; // 预期计算结果,0 + 1 + 2 +...+ 63 = 2016 if (group.thread_rank() == ) // 0 号线程报告计算结果,宣布开始新的 4 个协作组的计算任务
{
printf("\n\tSum of thread 0 ~ %d in group is %d (expected %d)\n", group.size() - , output, expectedOutput);
printf("\n\tNow creating %d groups, each of size 16 threads:\n", group.size() / );
}
group.sync(); // 协作组同步 thread_block_tile<> group16 = tiled_partition<>(group); // 每16个线程分割为一个协作组(只能使用 2 的整数次幂) int offset = group.thread_rank() - group16.thread_rank(); // 各协作组使用的共享内存的地址偏移量
printf("%d -> thread_rank = %d, group16.thread_rank = %d, offset = %d\n", threadIdx.x, group.thread_rank(), group16.thread_rank(), offset);
// dim3 group.group_index() 打印出来全是 (0, 0, 0),dim3 group.thread_index() 打印出来跟 group.thread_rank() 一样 input = group16.thread_rank(); // 获得线程在新协作组中的编号,并作为计算输入
output = sumReduction(group16, workspace + offset, input); // 规约计算,注意工作空间的地址偏移
expectedOutput = * / ; // 预期计算结果,0 + 1 + 2 +...+ 16 = 120 if (group16.thread_rank() == ) // 各协作组零号线程报告计算结果
printf("\n\tSum of all ranks 0..15 in group16 is %d (expected %d)\n", output, expectedOutput);
return;
} int main()
{
printf("\n\tStart with %d threads.\n", THREAD_PER_BLOCK); cgkernel << <, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int) >> > ();
cudaDeviceSynchronize(); printf("\n\tFinish.\n");
getchar();
return ;
}

● 输出结果

        Start with 64 threads.

        Sum of thread 0 ~ 63 in group is 2016 (expected 2016)

        Now creating 4 groups, each of size 16 threads:
0 -> thread_rank = 0, group16.thread_rank = 0, offset = 0
1 -> thread_rank = 1, group16.thread_rank = 1, offset = 0
2 -> thread_rank = 2, group16.thread_rank = 2, offset = 0
3 -> thread_rank = 3, group16.thread_rank = 3, offset = 0
4 -> thread_rank = 4, group16.thread_rank = 4, offset = 0
5 -> thread_rank = 5, group16.thread_rank = 5, offset = 0
6 -> thread_rank = 6, group16.thread_rank = 6, offset = 0
7 -> thread_rank = 7, group16.thread_rank = 7, offset = 0
8 -> thread_rank = 8, group16.thread_rank = 8, offset = 0
9 -> thread_rank = 9, group16.thread_rank = 9, offset = 0
10 -> thread_rank = 10, group16.thread_rank = 10, offset = 0
11 -> thread_rank = 11, group16.thread_rank = 11, offset = 0
12 -> thread_rank = 12, group16.thread_rank = 12, offset = 0
13 -> thread_rank = 13, group16.thread_rank = 13, offset = 0
14 -> thread_rank = 14, group16.thread_rank = 14, offset = 0
15 -> thread_rank = 15, group16.thread_rank = 15, offset = 0
16 -> thread_rank = 16, group16.thread_rank = 0, offset = 16
17 -> thread_rank = 17, group16.thread_rank = 1, offset = 16
18 -> thread_rank = 18, group16.thread_rank = 2, offset = 16
19 -> thread_rank = 19, group16.thread_rank = 3, offset = 16
20 -> thread_rank = 20, group16.thread_rank = 4, offset = 16
21 -> thread_rank = 21, group16.thread_rank = 5, offset = 16
22 -> thread_rank = 22, group16.thread_rank = 6, offset = 16
23 -> thread_rank = 23, group16.thread_rank = 7, offset = 16
24 -> thread_rank = 24, group16.thread_rank = 8, offset = 16
25 -> thread_rank = 25, group16.thread_rank = 9, offset = 16
26 -> thread_rank = 26, group16.thread_rank = 10, offset = 16
27 -> thread_rank = 27, group16.thread_rank = 11, offset = 16
28 -> thread_rank = 28, group16.thread_rank = 12, offset = 16
29 -> thread_rank = 29, group16.thread_rank = 13, offset = 16
30 -> thread_rank = 30, group16.thread_rank = 14, offset = 16
31 -> thread_rank = 31, group16.thread_rank = 15, offset = 16
32 -> thread_rank = 32, group16.thread_rank = 0, offset = 32
33 -> thread_rank = 33, group16.thread_rank = 1, offset = 32
34 -> thread_rank = 34, group16.thread_rank = 2, offset = 32
35 -> thread_rank = 35, group16.thread_rank = 3, offset = 32
36 -> thread_rank = 36, group16.thread_rank = 4, offset = 32
37 -> thread_rank = 37, group16.thread_rank = 5, offset = 32
38 -> thread_rank = 38, group16.thread_rank = 6, offset = 32
39 -> thread_rank = 39, group16.thread_rank = 7, offset = 32
40 -> thread_rank = 40, group16.thread_rank = 8, offset = 32
41 -> thread_rank = 41, group16.thread_rank = 9, offset = 32
42 -> thread_rank = 42, group16.thread_rank = 10, offset = 32
43 -> thread_rank = 43, group16.thread_rank = 11, offset = 32
44 -> thread_rank = 44, group16.thread_rank = 12, offset = 32
45 -> thread_rank = 45, group16.thread_rank = 13, offset = 32
46 -> thread_rank = 46, group16.thread_rank = 14, offset = 32
47 -> thread_rank = 47, group16.thread_rank = 15, offset = 32
48 -> thread_rank = 48, group16.thread_rank = 0, offset = 48
49 -> thread_rank = 49, group16.thread_rank = 1, offset = 48
50 -> thread_rank = 50, group16.thread_rank = 2, offset = 48
51 -> thread_rank = 51, group16.thread_rank = 3, offset = 48
52 -> thread_rank = 52, group16.thread_rank = 4, offset = 48
53 -> thread_rank = 53, group16.thread_rank = 5, offset = 48
54 -> thread_rank = 54, group16.thread_rank = 6, offset = 48
55 -> thread_rank = 55, group16.thread_rank = 7, offset = 48
56 -> thread_rank = 56, group16.thread_rank = 8, offset = 48
57 -> thread_rank = 57, group16.thread_rank = 9, offset = 48
58 -> thread_rank = 58, group16.thread_rank = 10, offset = 48
59 -> thread_rank = 59, group16.thread_rank = 11, offset = 48
60 -> thread_rank = 60, group16.thread_rank = 12, offset = 48
61 -> thread_rank = 61, group16.thread_rank = 13, offset = 48
62 -> thread_rank = 62, group16.thread_rank = 14, offset = 48
63 -> thread_rank = 63, group16.thread_rank = 15, offset = 48

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Finish.

▶ 涨姿势:

● 相关定义

 // cooperative_groups_helper.h
# if !defined(_CG_QUALIFIER)
# define _CG_QUALIFIER __forceinline__ __device__
# endif

# define die() assert(0);

// cooperative_groups.h (declaration order rearranged for readability)
class thread_group // generic thread-group type
{
    friend _CG_QUALIFIER thread_group this_thread();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

protected:
    // 8-byte aligned tagged union: a group is either a coalesced tile
    // (type + size + lane mask) or an opaque two-pointer buffer
    union __align__(8)
    {
        unsigned int type : 8;
        struct
        {
            unsigned int type : 8;
            unsigned int size : 24;
            unsigned int mask;
        } coalesced;
        struct
        {
            void* ptr[2];
        } buffer;
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);

    _CG_QUALIFIER thread_group(__internal::groupType type)
    {
        _data.type = type;
    }

#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER unsigned int size() const;
    _CG_QUALIFIER unsigned int thread_rank() const;
    _CG_QUALIFIER void sync() const;
};

class thread_block : public thread_group
{
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

    _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {}

    // Builds the coalesced tile this thread belongs to; tilesz must be a
    // power of two no larger than a warp (32)
    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const
    {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz)
        {
            die();
            return (thread_block());
        }

        unsigned int mask;
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        unsigned int masklength = min(size() - base_offset, tilesz);

        mask = (unsigned int)(-1) >> (32 - masklength);
        mask <<= (__internal::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(__internal::CoalescedTile);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        return (tile);
    }

public:
    _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
    _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }
    _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
    _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
};

// Used by the sample code above; it just invokes the thread_block constructor
_CG_QUALIFIER thread_block this_thread_block()
{
    return (thread_block());
}

template <unsigned int Size>
class thread_block_tile;
template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
template <> class thread_block_tile<8>  : public __thread_block_tile_base<8>  { };
template <> class thread_block_tile<4>  : public __thread_block_tile_base<4>  { };
template <> class thread_block_tile<2>  : public __thread_block_tile_base<2>  { };
template <> class thread_block_tile<1>  : public __thread_block_tile_base<1>  { };

template <unsigned int Size>
class __thread_block_tile_base : public thread_group
{
    static const unsigned int numThreads = Size;

    // Lane mask of the warp lanes that belong to this tile
    _CG_QUALIFIER unsigned int build_mask() const
    {
        unsigned int mask;
        if (numThreads == 32)
            mask = 0xFFFFFFFF;
        else
        {
            mask = (unsigned int)(-1) >> (32 - numThreads);
            mask <<= (__internal::laneid() & (~(numThreads - 1)));
        }
        return (mask);
    }

protected:
    _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile)
    {
        _data.coalesced.mask = build_mask();
        _data.coalesced.size = numThreads;
    }

public:
    _CG_QUALIFIER void sync() const { __syncwarp(build_mask()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (threadIdx.x & (numThreads - 1)); }
    _CG_QUALIFIER unsigned int size() const { return (numThreads); }

    // PTX supported collectives
    _CG_QUALIFIER int shfl(int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    ...
#ifdef _CG_HAS_FP16_COLLECTIVE
    _CG_QUALIFIER __half shfl(__half var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    ...
#endif
#ifdef _CG_HAS_MATCH_COLLECTIVE
    _CG_QUALIFIER unsigned int match_any(int val) const
    {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    ...
#endif
};

● 用到的线程协作相关函数

 thread_block threadBlockGroup = this_thread_block();    // pack the current thread block into one cooperative group

 thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup); // partition the group into 16-thread tiles

 int in = tiledPartition16.thread_rank();                // this thread's rank within its tile

 tiledPartition16.sync();                            // tile-wide synchronization

0_Simple__simpleCooperativeGroups的更多相关文章

随机推荐

  1. 定义一组抽象的 Awaiter 的实现接口,你下次写自己的 await 可等待对象时将更加方便

    我在几篇文章中都说到了在 .NET 中自己实现 Awaiter 情况.async / await 写异步代码用起来真的很爽,就像写同步一样.然而实现 Awaiter 没有现成的接口,它需要你按照编译器 ...

  2. POJ 3253 Fence Repair STL 优先队列

    这题做完后觉得很水,主要的想法就是逆过程思考,原题是截断,可以想成是拼装,一共有n根木棍,最后要拼成一根完整的,每两根小的拼成一根大的,拼成后的木棍长度就是费用,要求费用最少.显然的是一共会拼接n-1 ...

  3. 【分形】【洛谷P1498】

    https://www.luogu.org/problemnew/show/P1498 题目描述 自从到了南蛮之地,孔明不仅把孟获收拾的服服帖帖,而且还发现了不少少数民族的智慧,他发现少数民族的图腾往 ...

  4. 关于Hibernate性能优化之 FetchType=Lazy时查询数据

    当表A和表B一对多的关系 对于A和B的实体类,设置FetchType=EAGER时,取A表数据,对应B表的数据都会跟着一起加载,优点不用进行二次查询.缺点是严重影响数据查询的访问时间. 解决办法Fet ...

  5. jquery中的 .parent()

    ☆ 遍历 - .parent() 方法: 查找每个段落的带有 "selected" 类的父元素: <body> <ul class="level-1&q ...

  6. Django的DateTimeField和DateField

    一.DateField: class DateField(auto_now=False, auto_now_add=False, **options)[source] auto_now:每次保存时,都 ...

  7. Oracle 11gR2 RAC 常用维护操作 说明

    一.启动和停止集群 在Oracle 11gR2 下的RAC,架构发生了变化.CRS的信息也是放在ASM 实例里的,所以要关asm,必须关闭crs, 如果还使用了acfs的话,一关crs那么acfs里的 ...

  8. Angular 4 管道

    一.date管道 1.html 2. 控制器中的定义brithday 3.效果图 如果时间格式 为: 我的生日是{{birthday | date:'yyyy-MM-dd HH:mm:ss'}} 则效 ...

  9. c#操作xml的代码(插入节点、修改节点、删除节点等)

    bookstore.xml文件内容: 复制代码代码示例: <?xml version="1.0" encoding="gb2312"?><bo ...

  10. Modbus tcp 格式说明 通讯机制 附C#测试工具用于学习,测试

    前言: 之前的博客介绍了如何用C#来读写modbus tcp服务器的数据,文章:http://www.cnblogs.com/dathlin/p/7885368.html 当然也有如何创建一个服务器文 ...