▶ 使用 cuda 内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量

▶ 源代码

 // cppOverload_kernel.cu
//
// Three overloads of simple_kernel, used by the host program to demonstrate
// querying per-kernel resource usage (shared memory, registers, ...) through
// cudaFuncAttributes.
//
// Preconditions: THREAD_N must be #defined by the including translation unit
// and equal blockDim.x; the launch must not create more threads than there
// are input elements (the kernels perform no bounds check).

// Overload 1: pOut[i] = pIn[i] * a + i
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    __shared__ int tile[THREAD_N];   // one element staged per thread

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = pIn[gid];
    __syncthreads();                 // writes visible block-wide before reads

    pOut[gid] = tile[threadIdx.x] * a + gid;
}

// Overload 2: pOut[i] = (pIn[i].x + pIn[i].y) * a + i
__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 tile[THREAD_N];

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = pIn[gid];
    __syncthreads();

    pOut[gid] = (tile[threadIdx.x].x + tile[threadIdx.x].y) * a + gid;
}

// Overload 3: pOut[i] = (pIn1[i] + pIn2[i]) * a + i
__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int tileA[THREAD_N];
    __shared__ int tileB[THREAD_N];

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    tileA[threadIdx.x] = pIn1[gid];
    tileB[threadIdx.x] = pIn2[gid];
    __syncthreads();

    pOut[gid] = (tileA[threadIdx.x] + tileB[threadIdx.x]) * a + gid;
}
 // cppOverload.cu
//
// Host driver: resolves three overloads of simple_kernel through typed
// function pointers, prints each kernel's resource usage (shared/constant/
// local memory, registers, PTX and binary versions) via
// cudaFuncGetAttributes, launches each overload, and verifies the results
// against a CPU reference.
#include <stdio.h>
#include <stdlib.h>   // exit
#include <string.h>   // memset
#include <helper_cuda.h>
#include <helper_math.h>
#include <helper_string.h>

// The kernel file uses THREAD_N, so it must be defined before the #include.
#define THREAD_N 256
#define N        1024   // output element count; a multiple of THREAD_N
#include "cppOverload_kernel.cu"

#define DIV_UP(a, b) (((a) + (b) - 1) / (b))   // ceiling division

// Abort with file/line context on any CUDA runtime error.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Dump the fields of a cudaFuncAttributes struct.
#define OUTPUT_ATTR(attr)                                       \
    printf("Shared Size: %d\n", (int)attr.sharedSizeBytes);     \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);    \
    printf("Local Size: %d\n", (int)attr.localSizeBytes);       \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
    printf("Number of Registers: %d\n", attr.numRegs);          \
    printf("PTX Version: %d\n", attr.ptxVersion);               \
    printf("Binary Version: %d\n", attr.binaryVersion);

// CPU reference for overload 1: expects hOutput[i] == hInput[i]*a + i.
bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        int cpuRes = hInput[i] * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

// CPU reference for overload 2: expects hOutput[i] == (x+y)*a + i,
// where (x, y) is the i-th int2 of the input viewed as pairs.
bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        int cpuRes = (hInput[i].x + hInput[i].y) * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

// CPU reference for overload 3: expects hOutput[i] == (in1[i]+in2[i])*a + i.
bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i)
            return false;
    }
    return true;
}

int main(int argc, const char *argv[])
{
    CUDA_CHECK(cudaSetDevice(0));

    int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;

    // The input buffers hold 2*N ints: overload 2 reinterprets them as
    // N int2 values, and overload 3 uses the two N-element halves as
    // independent input arrays.
    CUDA_CHECK(cudaMalloc(&dInput, sizeof(int) * N * 2));
    CUDA_CHECK(cudaMalloc(&dOutput, sizeof(int) * N));
    CUDA_CHECK(cudaMallocHost(&hInput, sizeof(int) * N * 2));   // pinned host memory
    CUDA_CHECK(cudaMallocHost(&hOutput, sizeof(int) * N));

    for (int i = 0; i < N * 2; i++)
        hInput[i] = i;
    CUDA_CHECK(cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice));

    const int a = 2;   // arbitrary scale factor, shared by kernels and CPU checks

    // C++ overload resolution happens here: the pointer type selects which
    // simple_kernel overload each function pointer binds to.
    void (*func1)(const int *, int *, int)              = simple_kernel;
    void (*func2)(const int2 *, int *, int)             = simple_kernel;
    void (*func3)(const int *, const int *, int *, int) = simple_kernel;

    struct cudaFuncAttributes attr;

    // ---- overload 1 ----
    memset(&attr, 0, sizeof(attr));
    CUDA_CHECK(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));
    CUDA_CHECK(cudaFuncGetAttributes(&attr, *func1));   // static info; no launch needed
    OUTPUT_ATTR(attr);
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    CUDA_CHECK(cudaGetLastError());                     // catch launch-config errors
    CUDA_CHECK(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n",
           check_func1(hInput, hOutput, a) ? "PASSED" : "FAILED");

    // ---- overload 2 ----
    memset(&attr, 0, sizeof(attr));
    CUDA_CHECK(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
    CUDA_CHECK(cudaFuncGetAttributes(&attr, *func2));
    OUTPUT_ATTR(attr);
    // View the 2*N-int buffer as N int2; cudaMalloc returns suitably
    // aligned memory, so the reinterpretation is safe.
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n",
           check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a) ? "PASSED" : "FAILED");

    // ---- overload 3 ----
    memset(&attr, 0, sizeof(attr));
    CUDA_CHECK(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
    CUDA_CHECK(cudaFuncGetAttributes(&attr, *func3));
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n",
           check_func3(&hInput[0], &hInput[N], hOutput, a) ? "PASSED" : "FAILED");

    CUDA_CHECK(cudaFree(dInput));
    CUDA_CHECK(cudaFree(dOutput));
    CUDA_CHECK(cudaFreeHost(hOutput));
    CUDA_CHECK(cudaFreeHost(hInput));

    getchar();   // keep the console window open (Windows sample convention)
    return 0;
}

● 输出结果:

Shared Size:
Constant Size:
Local Size:
Max Threads Per Block:
Number of Registers:
PTX Version:
Binary Version:
simple_kernel(const int *pIn, int *pOut, int a) PASSED Shared Size:
Constant Size:
Local Size:
Max Threads Per Block:
Number of Registers:
PTX Version:
Binary Version:
simple_kernel(const int2 *pIn, int *pOut, int a) PASSED Shared Size:
Constant Size:
Local Size:
Max Threads Per Block:
Number of Registers:
PTX Version:
Binary Version:
simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

▶ 涨姿势:

● cuda 使用扩展名为 .cuh 的头文件

● cuda内置结构 cudaFuncAttributes 的定义:

 struct __device_builtin__ cudaFuncAttributes
{
size_t sharedSizeBytes; // 共享内存大小
size_t constSizeBytes; // 常量内存大小
size_t localSizeBytes; // 局部内存大小
int maxThreadsPerBlock; // 每个线程块的最大线程数量
int numRegs; // 寄存器数量
int ptxVersion; // PTX版本号
int binaryVersion; // 机器码版本号
int cacheModeCA; // 是否使用编译指令 -Xptxas --dlcm=ca
};

● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量

 struct cudaFuncAttributes attr;
memset(&attr, 0, sizeof(attr));
cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);
cudaFuncGetAttributes(&attr, *function);

■ 涉及的函数

 extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
} #define OUTPUT_ATTR(attr) \
printf("Shared Size: %d\n", (int)attr.sharedSizeBytes); \
printf("Constant Size: %d\n", (int)attr.constSizeBytes); \
printf("Local Size: %d\n", (int)attr.localSizeBytes); \
printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
printf("Number of Registers: %d\n", attr.numRegs); \
printf("PTX Version: %d\n", attr.ptxVersion); \
printf("Binary Version: %d\n", attr.binaryVersion);

0_Simple__cppOverload的更多相关文章

随机推荐

  1. Atlas框架介绍集成(一)

    Atlas是什么? Atlas是一个Android客户端容器框架,主要提供了组件化.动态性.解耦化的支持.支持在编码期.Apk运行期以及后续运维修复期的各种问题. 在工程期,实现工程独立开发,调试功能 ...

  2. ASP.NET Core 2.0 使用支付宝PC网站支付

    前言 最近在使用ASP.NET Core来进行开发,刚好有个接入支付宝支付的需求,百度了一下没找到相关的资料,看了官方的SDK以及Demo都还是.NET Framework的,所以就先根据官方SDK的 ...

  3. C++Builder中MessageBox的基本用法

    C++Builder中MessageBox的基本用法 返回值:IDYES=Application->MessageBox("","",MBYESNO) i ...

  4. Permutations 好题

    Permutations Time Limit: 20000/10000MS (Java/Others) Memory Limit: 128000/64000KB (Java/Others) Subm ...

  5. 跨主机使用 Rex-Ray volume - 每天5分钟玩转 Docker 容器技术(77)

    上一节我们在 docker1 上的 MySQL 容器中使用了 Rex-Ray volume mysqldata,更新了数据库.现在容器已经删除,今天将演示在 docker2 中重新使用这个卷. 在 d ...

  6. Iframe刷新页面

    window.parent.frames["name"].location="url";

  7. Java面向对象(封装性概论)

     Java面向对象(封装性概论) 知识概要:                   (1)面向对象概念 (2)类与对象的关系 (3)封装 (4)构造函数 (5)this关键字 (6)static关键 ...

  8. 一道javascript面试题(闭包与函数柯里化)

    要求写一个函数add(),分别实现能如下效果: (1)console.log(add(1)(2)(3)(4)()); (2)console.log(add(1,2)(3,4)()); (3)conso ...

  9. Echarts数据可视化series-heatmap热力图,开发全解+完美注释

    全栈工程师开发手册 (作者:栾鹏) Echarts数据可视化开发代码注释全解 Echarts数据可视化开发参数配置全解 6大公共组件详解(点击进入): title详解. tooltip详解.toolb ...

  10. TensorFlow问题:The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.

    1. 问题描述 The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available o ...