0_Simple__simpleAtomicIntrinsics + 0_Simple__simpleAtomicIntrinsics_nvrtc
原子操作。并且在静态代码和运行时编译两种条件下使用。
▶ 源代码:静态使用
#ifndef _SIMPLEATOMICS_KERNEL_H_
#define _SIMPLEATOMICS_KERNEL_H_
//#include "device_launch_parameters.h" __global__ void testKernel(int *g_odata)
{
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; // 算术运算原子指令
atomicAdd(&g_odata[], ); // 0号位加 10 atomicSub(&g_odata[], ); // 1号位减 10 atomicExch(&g_odata[], tid); // 2号位与 tid 号值交换(获得最后一个访问的 tid 号) atomicMax(&g_odata[], tid); // 3号位获得最大的 tid 号 atomicMin(&g_odata[], tid); // 4号位获得最小的 tid 号 atomicInc((unsigned int *)&g_odata[], ); // 5号位做模 17 的加法(g_odata[5] == 15 时加 1 得 16,再加 1 得 0) atomicDec((unsigned int *)&g_odata[], );// 6号位做模 137 的减法(g_odata[5] == 0 时减 1 得 136,再减 1 得 135) atomicCAS(&g_odata[], tid - , tid); // 7号位迭代 (g_odata[7] == tid - 1) ? tid : (g_odata[7]);
// 即以 g_odata[7] 初值为起点,增量为 1 的子序列的最大长度(一旦有增量为 1 的元素插到前面去该值就再也不变)
// 位运算原子指令
atomicAnd(&g_odata[], *tid+); // 8号位为 1,注意 (2k+7)%2 == 1 但 (2k+7)%(2^m) == 0 或 1,即仅最后一位能保证为 1 atomicOr(&g_odata[], << tid); // 9号位为 -1,所有的位均为 1 atomicXor(&g_odata[], tid); // 10号位为 255,注意异或运算具有交换律和结合律,硬算
} #endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
/*simpleAtomicIntrinsics_cpu.cpp*/
#include <stdio.h> extern "C" int computeGold(int *gpuData, const int len); int computeGold(int *gpuData, const int len)
{
if (gpuData[] != * len)
{
printf("atomicAdd failed\n");
return false;
}
if (gpuData[] != - * len)
{
printf("atomicSub failed\n");
return false;
}
if (gpuData[] < || gpuData[] >= len)// gpuData[2] ∈ [0, len)
{
printf("atomicExch failed\n");
return false;
}
if (gpuData[] != len - )
{
printf("atomicMax failed\n");
return false;
}
if (gpuData[]!=)
{
printf("atomicMin failed\n");
return false;
}
if (gpuData[] != len % )
{
printf("atomicInc failed\n");
return false;
}
if (gpuData[] != - len % )
{
printf("atomicDec failed\n");
return false;
}
if (gpuData[] < || gpuData[] >= len)// gpuData[7] ∈ [0, len)
{
printf("atomicCAS failed\n");
return false;
}
if (gpuData[] != )
{
printf("atomicAnd failed\n");
return false;
}
if (gpuData[] != -)
{
printf("atomicOr failed\n");
return false;
}
if (gpuData[] != 0xff)
{
printf("atomicXor failed\n");
return false;
}
return true;
}
#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleAtomicIntrinsics_kernel.cuh" #define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX extern "C" bool computeGold(int *gpuData, const int len); bool runTest()
{
bool testResult = false;
unsigned int numThreads = ;
unsigned int numBlocks = ;
unsigned int numData = ;
unsigned int memSize = sizeof(int) * numData; int *h_data = (int *) malloc(memSize);
for (unsigned int i = ; i < numData; h_data[i] = , i++); // 初始化为全零
h_data[] = h_data[] = 0xff; // 搞点非零值 int *d_data;
cudaMalloc((void **) &d_data, memSize);
cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice); // 输出运算前的结果
printf("\n\tBefore:");
for (int i = ; i < numData; i++)
printf("%8d,", h_data[i]);
printf("\n"); // 计算和计时
StopWatchInterface *timer;
sdkCreateTimer(&timer);
sdkStartTimer(&timer); testKernel << <numBlocks, numThreads >> > (d_data);
getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer);
printf("\nProcessing time: %f ms\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost); // 输出运算后的结果
printf("\n\tAfter :");
for (int i = ; i < numData; i++)
printf("%8d,", h_data[i]);
printf("\n"); testResult = computeGold(h_data, numThreads * numBlocks); free(h_data);
cudaFree(d_data); return testResult;
} int main()
{
bool testResult; printf("\n\tStarted!\n"); testResult = runTest(); printf("\n\tCompleted! main function returned %s\n", testResult ? "OK!" : "ERROR!");
getchar(); return ;
}
▶ 源代码:即时编译
/*simpleAtomicIntrinsics_kernel.cuh 发生变化的地方*/
extern "C" __global__ void testKernel(int *g_odata)
/*simpleAtomicIntrinsics_cpu.cpp 完全一样*/
/*simpleAtomicIntrinsics.cpp*/
#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <nvrtc_helper.h>
#include <helper_functions.h>// includes cuda.h and cuda_runtime_api.h #define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX extern "C" bool computeGold(int *gpuData, const int len); bool runTest()
{
bool testResult = false;
unsigned int numThreads = ;
unsigned int numBlocks = ;
unsigned int numData = ;
unsigned int memSize = sizeof(int) * numData; //即时编译过程
char *kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", NULL);
char *ptx;
size_t ptxSize;
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize);
CUmodule module = loadPTX(ptx, , NULL);
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, module, "testKernel"); int *h_data = (int *) malloc(memSize);
for (unsigned int i = ; i < numData; h_data[i] = , i++);
h_data[] = h_data[] = 0xff; CUdeviceptr d_data;
cuMemAlloc(&d_data, memSize);
cuMemcpyHtoD(d_data, h_data, memSize); dim3 cudaBlockSize(numThreads,,);
dim3 cudaGridSize(numBlocks, , );
void *arr[] = { (void *)&d_data };
cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, , , &arr[], ); cuCtxSynchronize(); cuMemcpyDtoH(h_data, d_data, memSize); testResult = computeGold(h_data, numThreads * numBlocks); free(h_data);
cuMemFree(d_data); return testResult;
} int main()
{
bool testResult; printf("\n\tStarted!\n"); testResult = runTest(); printf("\n\tCompleted! main function returned %s\n", testResult ? "OK!" : "ERROR!");
getchar(); return ;
}
▶ 输出结果:
Started! Before: , , , , , , , , , , , Processing time: 0.035352 ms After : , -, , , , , , , , -, , Completed! main function returned OK!
▶ 涨姿势
● 一个有趣的数列:命 x0 = 0,xn = xn-1 XOR n,则有 x4n == 4n,x4n+1 = 1, x4n+2 == 4n+3, x4n+3 == 0。当改变初值的时候该表达式发生变化,结果如下图。三种颜色分别代表初始选作右边三个值的时候的结果。
● 解毒 device_atomic_functions.h 与原子操作。只保留了有效部分,去掉了注释和留白。
#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
#define __DEVICE_ATOMIC_FUNCTIONS_HPP__ #if defined(__CUDACC_RTC__) // 主机编译
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __host__ __device__
#else // 设备编译
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
#endif #if defined(__cplusplus) && defined(__CUDACC__) #include "builtin_types.h"
#include "host_defines.h" // 整数原子加法。返回 *address 旧值,*address += val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
{
return __iAtomicAdd(address, val);
} // 无符号整数原子加法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
{
return __uAtomicAdd(address, val);
} // 整数原子减法,注意转换为加法来运算。返回 *address 旧值,*address -= val;。
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
{
return __iAtomicAdd(address, (unsigned int)-(int)val);
} // 无符号整数原子减法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
{
return __uAtomicAdd(address, (unsigned int)-(int)val);
} // 整数原子替换。返回 *address 旧值,*address = val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
{
return __iAtomicExch(address, val);
} // 无符号整数原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
{
return __uAtomicExch(address, val);
} // 浮点原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
{
return __fAtomicExch(address, val);
} // 整数原子取小。返回 *address 旧值,*address = MIN(*adress, val);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
{
return __iAtomicMin(address, val);
} // 无符号整数原子取小
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
{
return __uAtomicMin(address, val);
} // 整数原子取大。返回 *address 旧值,*address = MAX(*adress, val);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
{
return __iAtomicMax(address, val);
} // 无符号整数原子取大
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
{
return __uAtomicMax(address, val);
} // 无符号整数原子模加法。返回 *address 旧值,*address = (*adress + 1) % (val + 1);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
{
return __uAtomicInc(address, val);
} // 无符号整数原子模减法。返回 *address 旧值,*address = (*adress + val) % (val + 1);
// 不用 (*adress - 1) 是为了把结果控制在 [0, val] 中,防止变成负数,这与 C 中的 % 运算不同
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
{
return __uAtomicDec(address, val);
} // 整数原子按位且。返回 *address 旧值,*adress &= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
{
return __iAtomicAnd(address, val);
} // 无符号整数原子按位且
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
{
return __uAtomicAnd(address, val);
} // 整数原子按位或。返回 *address 旧值,*adress |= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
{
return __iAtomicOr(address, val);
} // 无符号整数原子按位或
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
{
return __uAtomicOr(address, val);
} // 整数原子按位异或。返回 *address 旧值,*adress ^= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
{
return __iAtomicXor(address, val);
} // 无符号整数原子按位异或
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
{
return __uAtomicXor(address, val);
} // 整数原子比较赋值。返回 *address 旧值,*address = (*address == compare) ? val : *address;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
{
return __iAtomicCAS(address, compare, val);
} // 无符号整数原子比较赋值
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{
return __uAtomicCAS(address, compare, val);
} // 无符号长整数原子加法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
{
return __ullAtomicAdd(address, val);
} // 无符号长整数原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
{
return __ullAtomicExch(address, val);
} // 无符号长整数原子比较赋值
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
{
return __ullAtomicCAS(address, compare, val);
} // 原子存在量词
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
{
return (bool)__any((int)cond);
} // 原子全称量词
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
{
return (bool)__all((int)cond);
} #endif /* __cplusplus && __CUDACC__ */ #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
● 原子操作函数声明在 device_functions.h 中。
当设备计算能力 > 320 或 > 600 时开放各原子操作对应的 block 和 system 函数,并开放对应 long long、float、double 型数据的同一个函数,例如:
#if !defined(__CUDACC_RTC__) || __CUDA_ARCH__ >= 600
__DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val);
#endif /* !__CUDACC_RTC__ || __CUDA_ARCH__ >= 600 */
计算能力 600 以上能用的所有原子操作:
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __host__ __device__ __cudart_builtin__ __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS(int *p, int compare, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_block(int *p, int compare, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_system(int *p, int compare, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS(unsigned int *p, unsigned int compare, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_block(unsigned int *p, unsigned int compare, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_system(unsigned int *p, unsigned int compare unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS(unsigned long long int *p unsigned long long int compare unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_block(unsigned long long int *p unsigned long long int compare unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_system(unsigned long long int *p unsigned long long int compare unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicAnd(long long int *p, long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicAnd(unsigned long long int *p unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicOr(long long int *p, long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicOr(unsigned long long int *p unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_system(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_block(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_system(int *p, int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicXor(long long int *p, long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_block(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_system(long long *p, long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_block(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_system(unsigned int *p, unsigned int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicXor(unsigned long long int *p unsigned long long int val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_block(unsigned long long *p, unsigned long long val); __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_system(unsigned long long *p, unsigned long long val);
0_Simple__simpleAtomicIntrinsics + 0_Simple__simpleAtomicIntrinsics_nvrtc的更多相关文章
随机推荐
- Spark任务流程笔记
Spark学习笔记总结 02. Spark任务流程 1. RDD的依赖关系 RDD和它依赖的父RDD(s)的关系有两种不同的类型,即窄依赖(narrow dependency)和宽依赖(wide de ...
- Ubuntu17.04安装wps
1.进入http://community.wps.cn/download/下载wps-office安装包,我下载的是dep的包. 2.进行安装执行命令:sudo dpkg -i wps-office_ ...
- JS之脚本延迟
自从开了博客,我就一下班回来匆匆吃完饭门一关等一开电脑一打开匆匆的研究东西,以至于朋友们都怀疑我是不是都得了自闭症 其实因为我有恐惧心理怕自己的技术哪天跟不上社会了,说到技术我觉得技术不求越新越好,但 ...
- Python扩展方法一二事
前言 跟着一个有强迫症的老板干活是一件极其幸福的事情(你懂的).最近碰到一个问题,简单的说就是对一个对象做出部分修改后仍然返回此对象,于是我就写了一个方法,老板看了之后只有一句话:不雅观,改成直接对此 ...
- 翻译连载 | 第 10 章:异步的函数式(上)-《JavaScript轻量级函数式编程》 |《你不知道的JS》姊妹篇
原文地址:Functional-Light-JS 原文作者:Kyle Simpson-<You-Dont-Know-JS>作者 关于译者:这是一个流淌着沪江血液的纯粹工程:认真,是 HTM ...
- Qt中的坐标系统
Qt使用统一的坐标系统来定位窗口部件的位置和大小. 以屏幕的左上角为原点即(0, 0)点,从左向右为x轴正向,从上向下为y轴正向,这整个屏幕的坐标系统就用来定位顶层窗口: 此外,窗口内部也有自己的坐标 ...
- 再起航,我的学习笔记之JavaScript设计模式28(委托模式)
## 委托模式 ### 概念介绍 **委托模式(Entrust): **多个对象接收并处理同一请求,他们将请求委托给另一个对象统一处理请求. ### 利用委托优化循环 如果我们有一个需求需要让用户点击 ...
- Max Consecutive Ones
Given a binary array, find the maximum number of consecutive 1s in this array. Example 1: Input: [1, ...
- ORALCE PL/SQL学习笔记
ORALCE PL/SQL学习笔记 详情见自己电脑的备份数据资料
- Socket简单实现数据交互及上传
声明:本文为原创,如需转载请说明出处:http://www.cnblogs.com/gudu1/p/7669175.html 首先为什么要写这个呢?因为在几个月之前我还使用Socket做一个小项目,而 ...