下面以一个例子作为教程,实现的功能是广播加法(broadcast sum):把形状为 (x, 1) 的张量 b 广播后,逐元素加到形状为 (x, y) 的张量 a 上(等价于 PyTorch 中的 a += b)。



// mathutil_cuda_kernel.cu
// Headers: <curand.h> comes from the CUDA toolkit; "mathutil_cuda_kernel.h" is this project's own header
#include <curand.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include "mathutil_cuda_kernel.h" // 获取GPU线程通道信息
// Compute the launch grid needed to cover `n` elements with BLOCK threads
// per block.
//
// Parameters:
//   n - number of elements (threads) that must be covered.
// Returns:
//   A dim3 (x, y, 1) such that x * y * BLOCK >= n. A one-dimensional grid is
//   used when it fits; otherwise the blocks are folded into a roughly square
//   two-dimensional grid.
dim3 cuda_gridsize(int n)
{
    // A non-positive count would produce a zero/negative grid dimension,
    // which is an invalid launch configuration; fall back to a single block.
    if (n < 1) {
        n = 1;
    }

    // Ceil-division: number of blocks required to cover n elements.
    int k = (n - 1) / BLOCK + 1;
    int x = k;
    int y = 1;

    // 65535 is the conservative per-dimension grid limit; spill the excess
    // blocks into the y dimension when x alone would exceed it.
    if (x > 65535) {
        x = (int)ceil(sqrt((double)k));
        y = (n - 1) / (x * BLOCK) + 1;
    }

    dim3 d(x, y, 1);
    return d;
}
// Device kernel: each thread updates exactly one element of `a`.
//
// Launch layout: one-dimensional thread blocks (only threadIdx.x is used) in
// a one- or two-dimensional grid (blockIdx.x and blockIdx.y are flattened).
// No shared memory is used.
//
// Parameters:
//   a    - device buffer indexed as a row-major x-by-y matrix
//          (element (j, k) lives at a[j * y + k]); updated in place.
//   b    - device buffer with one value per row of `a` (x values).
//   x, y - number of rows / columns of `a`.
//   size - total number of elements to process (x * y).
__global__ void broadcast_sum_kernel(float *a, float *b, int x, int y, int size)
{
    // Flatten the (possibly 2D) grid into a single global thread index.
    int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;

    // The grid rarely divides the data evenly; surplus threads do nothing.
    if (i >= size) return;

    int j = i % x;  // row index, 0 <= j < x
    i = i / x;
    int k = i % y;  // column index, 0 <= k < y

    // Broadcast along columns: every element of row j receives b[j].
    // Indexing b by the column index k would read past the end of a
    // length-x `b` whenever y > x.
    a[IDX2D(j, k, y)] += b[j];
}

// The function that follows is the host-side interface that C code links against.
// Host-side launcher for broadcast_sum_kernel.
//
// Parameters:
//   a      - device pointer to an x-by-y float buffer, updated in place.
//   b      - device pointer to the values that are broadcast onto `a`.
//   x, y   - dimensions of `a`.
//   stream - CUDA stream on which the kernel is enqueued.
//
// The launch is asynchronous; only launch-configuration errors are detected
// here and they are reported on stderr.
void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream)
{
    // Nothing to do for an empty (or invalid) shape; avoids a bogus launch.
    if (x <= 0 || y <= 0) {
        return;
    }

    int size = x * y;
    cudaError_t err;

    // One thread per element, BLOCK threads per block, no dynamic shared
    // memory (third launch argument is 0).
    broadcast_sum_kernel<<<cuda_gridsize(size), BLOCK, 0, stream>>>(a, b, x, y, size);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    }
}
#define _MATHUTIL_CUDA_KERNEL #define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk)) #define BLOCK 512
#define MAX_STREAMS 512 #ifdef __cplusplus
extern "C" {
#endif void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream); #ifdef __cplusplus
#endif #endif


// mathutil_cuda.c
// THC是pytorch底层GPU库
#include <THC/THC.h>
#include "mathutil_cuda_kernel.h"

// Global THC state object; declared here, defined elsewhere.
extern THCState *state;

// C entry point exposed to Python: adds `b_tensor` onto `a_tensor` in place
// by forwarding the raw device pointers to the CUDA launcher.
//
// Parameters:
//   a_tensor - CUDA float tensor, updated in place.
//   b_tensor - CUDA float tensor holding the values to broadcast.
//   x, y     - dimensions of a_tensor as passed by the caller.
// Returns:
//   1 unconditionally.
int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y)
{
    float *a = THCudaTensor_data(state, a_tensor);
    float *b = THCudaTensor_data(state, b_tensor);
    cudaStream_t stream = THCState_getCurrentStream(state);

    // Call the interface function implemented in the .cu file.
    broadcast_sum_cuda(a, b, x, y, stream);

    return 1;
}
// mathutil_cuda.h
// Prototype of the C wrapper; the build script lists this header for the extension.
int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y);


# Compile the CUDA kernel file into a position-independent object file.
# The build script expects this object at src/mathutil_cuda_kernel.cu.o.
nvcc -c -o mathutil_cuda_kernel.cu.o mathutil_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52
# build.py
# Builds the `_ext.cuda_util` extension from the C wrapper and the
# pre-compiled CUDA object file.
# NOTE(review): torch.utils.ffi is a legacy API — confirm the installed
# PyTorch version still provides it.
import os
import torch
from torch.utils.ffi import create_extension

# Absolute directory of this script; used to locate the pre-built object file.
this_file = os.path.dirname(os.path.realpath(__file__))

sources = []
headers = []
defines = []
with_cuda = False

if torch.cuda.is_available():
    print('Including CUDA code.')
    sources += ['src/mathutil_cuda.c']
    headers += ['src/mathutil_cuda.h']
    defines += [('WITH_CUDA', None)]
    with_cuda = True

# Location of the object file produced by nvcc.
extra_objects = ['src/mathutil_cuda_kernel.cu.o']
extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]

ffi = create_extension(
    '_ext.cuda_util',
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_objects=extra_objects,
)

if __name__ == '__main__':
    ffi.build()


import torch
from _ext import cuda_util  # import the compiled module from its build path

a = torch.randn(3, 5).cuda()
b = torch.randn(3, 1).cuda()
# The module name must match the one imported above.
cuda_util.broadcast_sum(a, b, *map(int, a.size()))

# The call above has the same effect as:
a = torch.randn(3, 5)
b = torch.randn(3, 1)
a += b


