▶ 使用 cuda 内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量

▶ 源代码

 // cppOverload_kernel.cu
/*
 * Three overloads of simple_kernel. Each thread stages its input element(s)
 * in statically sized shared memory and then writes one output element.
 *
 * Preconditions shared by all three overloads:
 *   - THREAD_N must be defined before this file is included; it sizes the
 *     shared-memory arrays.
 *   - blockDim.x must not exceed THREAD_N (the shared arrays are indexed by
 *     threadIdx.x).
 *   - There is no bounds check: the launch must supply exactly one thread per
 *     input/output element (1D grid, 1D blocks).
 *
 * No __syncthreads() is required: every thread reads back only the shared
 * slot that it wrote itself.
 */

// Overload 1: pOut[tid] = pIn[tid] * a + tid
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    __shared__ int sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    pOut[tid] = sData[threadIdx.x] * a + tid;
}

// Overload 2: pOut[tid] = (pIn[tid].x + pIn[tid].y) * a + tid
__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    pOut[tid] = (sData[threadIdx.x].x + sData[threadIdx.x].y) * a + tid;
}

// Overload 3: pOut[tid] = (pIn1[tid] + pIn2[tid]) * a + tid
__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int sData1[THREAD_N], sData2[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData1[threadIdx.x] = pIn1[tid];
    sData2[threadIdx.x] = pIn2[tid];
    pOut[tid] = (sData1[threadIdx.x] + sData2[threadIdx.x]) * a + tid;
}
 // cppOverload.cu
#include <stdio.h>
#include <string.h>  // memset

#include <helper_cuda.h>
#include <helper_math.h>
#include <helper_string.h>

#define THREAD_N 256
// The included kernel source uses THREAD_N, so it must be defined first.
#include "cppOverload_kernel.cu"

#define N 1024
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))

// Prints the fields of a cudaFuncAttributes value, one per line.
#define OUTPUT_ATTR(attr) \
    printf("Shared Size: %d\n", (int)attr.sharedSizeBytes); \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes); \
    printf("Local Size: %d\n", (int)attr.localSizeBytes); \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
    printf("Number of Registers: %d\n", attr.numRegs); \
    printf("PTX Version: %d\n", attr.ptxVersion); \
    printf("Binary Version: %d\n", attr.binaryVersion);

// CPU reference for simple_kernel(const int *, int *, int).
// Returns true when all N outputs equal hInput[i] * a + i.
bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        int cpuRes = hInput[i] * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

// CPU reference for simple_kernel(const int2 *, int *, int).
// Returns true when all N outputs equal (hInput[i].x + hInput[i].y) * a + i.
bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        int cpuRes = (hInput[i].x + hInput[i].y) * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

// CPU reference for simple_kernel(const int *, const int *, int *, int).
// Returns true when all N outputs equal (hInput1[i] + hInput2[i]) * a + i.
bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i)
            return false;
    }
    return true;
}

// Runs each simple_kernel overload once: prints its cudaFuncAttributes before
// the launch, then compares the device result against the CPU reference.
int main(int argc, const char *argv[])
{
    checkCudaErrors(cudaSetDevice(0));

    // The input buffers hold 2 * N ints: the int2 overload reads N int2
    // values, and the two-input overload reads [0, N) and [N, 2N).
    int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;
    checkCudaErrors(cudaMalloc(&dInput, sizeof(int) * N * 2));
    checkCudaErrors(cudaMalloc(&dOutput, sizeof(int) * N));
    checkCudaErrors(cudaMallocHost(&hInput, sizeof(int) * N * 2));
    checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int) * N));

    for (int i = 0; i < N * 2; i++)
        hInput[i] = i;
    checkCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice));

    const int a = 2;  // arbitrary multiplier passed to every overload

    // Typed function pointers select the overload to query and launch.
    void (*func1)(const int *, int *, int) = simple_kernel;
    void (*func2)(const int2 *, int *, int) = simple_kernel;
    void (*func3)(const int *, const int *, int *, int) = simple_kernel;
    struct cudaFuncAttributes attr;

    // function 1
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));  // inspect resource usage before running
    OUTPUT_ATTR(attr);
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", check_func1(hInput, hOutput, a) ? "PASSED" : "FAILED");

    // function 2
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
    OUTPUT_ATTR(attr);
    // The int buffer is reinterpreted as int2; it comes from cudaMalloc, so it is suitably aligned.
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a) ? "PASSED" : "FAILED");

    // function 3
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", check_func3(&hInput[0], &hInput[N], hOutput, a) ? "PASSED" : "FAILED");

    // Release every allocation made above.
    checkCudaErrors(cudaFree(dInput));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeHost(hInput));
    checkCudaErrors(cudaFreeHost(hOutput));
    return 0;
}

● 输出结果:

Shared Size:
Constant Size:
Local Size:
Max Threads Per Block:
Number of Registers:
PTX Version:
Binary Version:
simple_kernel(const int *pIn, int *pOut, int a) PASSED

Shared Size:
Constant Size:
Local Size:
Max Threads Per Block:
Number of Registers:
PTX Version:
Binary Version:
simple_kernel(const int2 *pIn, int *pOut, int a) PASSED

Shared Size:
Constant Size:
Local Size:
Max Threads Per Block:
Number of Registers:
PTX Version:
Binary Version:
simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

▶ 涨姿势:

● cuda 使用扩展名为 .cuh 的头文件

● cuda内置结构 cudaFuncAttributes 的定义:

 // Excerpt of the CUDA built-in structure filled in by cudaFuncGetAttributes.
 struct __device_builtin__ cudaFuncAttributes
 {
     size_t sharedSizeBytes;    // shared memory size
     size_t constSizeBytes;     // constant memory size
     size_t localSizeBytes;     // local memory size
     int    maxThreadsPerBlock; // maximum number of threads per block
     int    numRegs;            // number of registers
     int    ptxVersion;         // PTX version number
     int    binaryVersion;      // binary (machine code) version number
     int    cacheModeCA;        // whether compiled with -Xptxas --dlcm=ca
 };

● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量

 struct cudaFuncAttributes attr;
 memset(&attr, 0, sizeof(attr));                                // zero the structure before querying
 cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);  // set the kernel's cache preference
 cudaFuncGetAttributes(&attr, *function);                       // fill attr with the kernel's attributes

■ 涉及的函数

 // Host-side runtime API declaration: applies a cudaFuncCache setting to the given kernel.
 extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

 // Weak device-side definition of cudaFuncGetAttributes as quoted here;
 // this stub ignores its arguments and returns cudaErrorUnknown.
 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
 {
     return cudaErrorUnknown;
 }

 // Sample macro that prints the fields of a cudaFuncAttributes value, one per line.
 #define OUTPUT_ATTR(attr) \
     printf("Shared Size: %d\n", (int)attr.sharedSizeBytes); \
     printf("Constant Size: %d\n", (int)attr.constSizeBytes); \
     printf("Local Size: %d\n", (int)attr.localSizeBytes); \
     printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
     printf("Number of Registers: %d\n", attr.numRegs); \
     printf("PTX Version: %d\n", attr.ptxVersion); \
     printf("Binary Version: %d\n", attr.binaryVersion);



