原子操作。并且在静态代码和运行时编译两种条件下使用。

▶ 源代码:静态使用

 #ifndef _SIMPLEATOMICS_KERNEL_H_
#define _SIMPLEATOMICS_KERNEL_H_
//#include "device_launch_parameters.h" __global__ void testKernel(int *g_odata)
{
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; // 算术运算原子指令
atomicAdd(&g_odata[], ); // 0号位加 10 atomicSub(&g_odata[], ); // 1号位减 10 atomicExch(&g_odata[], tid); // 2号位与 tid 号值交换(获得最后一个访问的 tid 号) atomicMax(&g_odata[], tid); // 3号位获得最大的 tid 号 atomicMin(&g_odata[], tid); // 4号位获得最小的 tid 号 atomicInc((unsigned int *)&g_odata[], ); // 5号位做模 17 的加法(g_odata[5] == 15 时加 1 得 16,再加 1 得 0) atomicDec((unsigned int *)&g_odata[], );// 6号位做模 137 的减法(g_odata[5] == 0 时减 1 得 136,再减 1 得 135) atomicCAS(&g_odata[], tid - , tid); // 7号位迭代 (g_odata[7] == tid - 1) ? tid : (g_odata[7]);
// 即以 g_odata[7] 初值为起点,增量为 1 的子序列的最大长度(一旦有增量为 1 的元素插到前面去该值就再也不变)
// 位运算原子指令
atomicAnd(&g_odata[], *tid+); // 8号位为 1,注意 (2k+7)%2 == 1 但 (2k+7)%(2^m) == 0 或 1,即仅最后一位能保证为 1 atomicOr(&g_odata[], << tid); // 9号位为 -1,所有的位均为 1 atomicXor(&g_odata[], tid); // 10号位为 255,注意异或运算具有交换律和结合律,硬算
} #endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
 /*simpleAtomicIntrinsics_cpu.cpp*/
#include <stdio.h> extern "C" int computeGold(int *gpuData, const int len); int computeGold(int *gpuData, const int len)
{
if (gpuData[] != * len)
{
printf("atomicAdd failed\n");
return false;
}
if (gpuData[] != - * len)
{
printf("atomicSub failed\n");
return false;
}
if (gpuData[] < || gpuData[] >= len)// gpuData[2] ∈ [0, len)
{
printf("atomicExch failed\n");
return false;
}
if (gpuData[] != len - )
{
printf("atomicMax failed\n");
return false;
}
if (gpuData[]!=)
{
printf("atomicMin failed\n");
return false;
}
if (gpuData[] != len % )
{
printf("atomicInc failed\n");
return false;
}
if (gpuData[] != - len % )
{
printf("atomicDec failed\n");
return false;
}
if (gpuData[] < || gpuData[] >= len)// gpuData[7] ∈ [0, len)
{
printf("atomicCAS failed\n");
return false;
}
if (gpuData[] != )
{
printf("atomicAnd failed\n");
return false;
}
if (gpuData[] != -)
{
printf("atomicOr failed\n");
return false;
}
if (gpuData[] != 0xff)
{
printf("atomicXor failed\n");
return false;
}
return true;
}
 #include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleAtomicIntrinsics_kernel.cuh" #define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX extern "C" bool computeGold(int *gpuData, const int len); bool runTest()
{
bool testResult = false;
unsigned int numThreads = ;
unsigned int numBlocks = ;
unsigned int numData = ;
unsigned int memSize = sizeof(int) * numData; int *h_data = (int *) malloc(memSize);
for (unsigned int i = ; i < numData; h_data[i] = , i++); // 初始化为全零
h_data[] = h_data[] = 0xff; // 搞点非零值 int *d_data;
cudaMalloc((void **) &d_data, memSize);
cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice); // 输出运算前的结果
printf("\n\tBefore:");
for (int i = ; i < numData; i++)
printf("%8d,", h_data[i]);
printf("\n"); // 计算和计时
StopWatchInterface *timer;
sdkCreateTimer(&timer);
sdkStartTimer(&timer); testKernel << <numBlocks, numThreads >> > (d_data);
getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer);
printf("\nProcessing time: %f ms\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost); // 输出运算后的结果
printf("\n\tAfter :");
for (int i = ; i < numData; i++)
printf("%8d,", h_data[i]);
printf("\n"); testResult = computeGold(h_data, numThreads * numBlocks); free(h_data);
cudaFree(d_data); return testResult;
} int main()
{
bool testResult; printf("\n\tStarted!\n"); testResult = runTest(); printf("\n\tCompleted! main function returned %s\n", testResult ? "OK!" : "ERROR!");
getchar(); return ;
}

▶ 源代码:即时编译

 /*simpleAtomicIntrinsics_kernel.cuh 发生变化的地方*/
extern "C" __global__ void testKernel(int *g_odata)
 /*simpleAtomicIntrinsics_cpu.cpp 完全一样*/
 /*simpleAtomicIntrinsics.cpp*/
#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include <nvrtc_helper.h>
#include <helper_functions.h>// includes cuda.h and cuda_runtime_api.h #define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX extern "C" bool computeGold(int *gpuData, const int len); bool runTest()
{
bool testResult = false;
unsigned int numThreads = ;
unsigned int numBlocks = ;
unsigned int numData = ;
unsigned int memSize = sizeof(int) * numData; //即时编译过程
char *kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", NULL);
char *ptx;
size_t ptxSize;
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize);
CUmodule module = loadPTX(ptx, , NULL);
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, module, "testKernel"); int *h_data = (int *) malloc(memSize);
for (unsigned int i = ; i < numData; h_data[i] = , i++);
h_data[] = h_data[] = 0xff; CUdeviceptr d_data;
cuMemAlloc(&d_data, memSize);
cuMemcpyHtoD(d_data, h_data, memSize); dim3 cudaBlockSize(numThreads,,);
dim3 cudaGridSize(numBlocks, , );
void *arr[] = { (void *)&d_data };
cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, , , &arr[], ); cuCtxSynchronize(); cuMemcpyDtoH(h_data, d_data, memSize); testResult = computeGold(h_data, numThreads * numBlocks); free(h_data);
cuMemFree(d_data); return testResult;
} int main()
{
bool testResult; printf("\n\tStarted!\n"); testResult = runTest(); printf("\n\tCompleted! main function returned %s\n", testResult ? "OK!" : "ERROR!");
getchar(); return ;
}

▶ 输出结果:

    Started!

    Before:       ,       ,       ,       ,       ,       ,       ,       ,     ,       ,     ,

Processing time: 0.035352 ms

    After :  , -,   ,   ,       ,      ,      ,       ,       ,      -,     ,

    Completed! main function returned OK!

▶ 涨姿势

● 一个有趣的数列:命 x0 = 0,xn = xn-1 XOR n,则有 x4n == 4n,x4n+1 = 1, x4n+2 == 4n+3, x4n+3 == 0。当改变初值的时候该表达式发生变化,结果如下图。三种颜色分别代表初始选作右边三个值的时候的结果。

● 解毒 device_atomic_functions.h 与原子操作。只保留了有效部分,去掉了注释和留白。

 #if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
#define __DEVICE_ATOMIC_FUNCTIONS_HPP__ #if defined(__CUDACC_RTC__) // 主机编译
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __host__ __device__
#else // 设备编译
#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
#endif #if defined(__cplusplus) && defined(__CUDACC__) #include "builtin_types.h"
#include "host_defines.h" // 整数原子加法。返回 *address 旧值,*address += val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
{
return __iAtomicAdd(address, val);
} // 无符号整数原子加法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
{
return __uAtomicAdd(address, val);
} // 整数原子减法,注意转换为加法来运算。返回 *address 旧值,*address -= val;。
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
{
return __iAtomicAdd(address, (unsigned int)-(int)val);
} // 无符号整数原子减法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
{
return __uAtomicAdd(address, (unsigned int)-(int)val);
} // 整数原子替换。返回 *address 旧值,*address = val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
{
return __iAtomicExch(address, val);
} // 无符号整数原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
{
return __uAtomicExch(address, val);
} // 浮点原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
{
return __fAtomicExch(address, val);
} // 整数原子取小。返回 *address 旧值,*address = MIN(*adress, val);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
{
return __iAtomicMin(address, val);
} // 无符号整数原子取小
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
{
return __uAtomicMin(address, val);
} // 整数原子取大。返回 *address 旧值,*address = MAX(*adress, val);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
{
return __iAtomicMax(address, val);
} // 无符号整数原子取大
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
{
return __uAtomicMax(address, val);
} // 无符号整数原子模加法。返回 *address 旧值,*address = (*adress + 1) % (val + 1);
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
{
return __uAtomicInc(address, val);
} // 无符号整数原子模减法。返回 *address 旧值,*address = (*adress + val) % (val + 1);
// 不用 (*adress - 1) 是为了把结果控制在 [0, val] 中,防止变成负数,这与 C 中的 % 运算不同
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
{
return __uAtomicDec(address, val);
} // 整数原子按位且。返回 *address 旧值,*adress &= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
{
return __iAtomicAnd(address, val);
} // 无符号整数原子按位且
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
{
return __uAtomicAnd(address, val);
} // 整数原子按位或。返回 *address 旧值,*adress |= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
{
return __iAtomicOr(address, val);
} // 无符号整数原子按位或
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
{
return __uAtomicOr(address, val);
} // 整数原子按位异或。返回 *address 旧值,*adress ^= val;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
{
return __iAtomicXor(address, val);
} // 无符号整数原子按位异或
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
{
return __uAtomicXor(address, val);
} // 整数原子比较赋值。返回 *address 旧值,*address = (*address == compare) ? val : *address;
__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
{
return __iAtomicCAS(address, compare, val);
} // 无符号整数原子比较赋值
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{
return __uAtomicCAS(address, compare, val);
} // 无符号长整数原子加法
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
{
return __ullAtomicAdd(address, val);
} // 无符号长整数原子替换
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
{
return __ullAtomicExch(address, val);
} // 无符号长整数原子比较赋值
__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
{
return __ullAtomicCAS(address, compare, val);
} // 原子存在量词
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
{
return (bool)__any((int)cond);
} // 原子全称量词
__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
{
return (bool)__all((int)cond);
} #endif /* __cplusplus && __CUDACC__ */ #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */

● 原子操作函数声明在 device_functions.h 中。

  当设备计算能力 > 320 或 > 600 时开放各原子操作对应的 block 和 system 函数,并开放对应 long long、float、double 型数据的同一个函数,例如:

 #if !defined(__CUDACC_RTC__) || __CUDA_ARCH__ >= 600
__DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val);
#endif /* !__CUDACC_RTC__ || __CUDA_ARCH__ >= 600 */

  计算能力 600 以上能用的所有原子操作:

 #define __DEVICE_FUNCTIONS_STATIC_DECL__ __host__ __device__ __cudart_builtin__

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_system(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd(float *p, float val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_system(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch(float *p, float val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_block(float *p, float val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_system(float *p, float val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_block(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_system(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_system(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_block(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_system(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_system(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS(int *p, int compare, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_block(int *p, int compare, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_system(int *p, int compare, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS(unsigned int *p, unsigned int compare, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_block(unsigned int *p, unsigned int compare, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_system(unsigned int *p, unsigned int compare     unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS(unsigned long long int *p     unsigned long long int compare     unsigned long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_block(unsigned long long int *p     unsigned long long int compare     unsigned long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_system(unsigned long long int *p     unsigned long long int compare     unsigned long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicAnd(long long int *p, long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_block(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_system(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicAnd(unsigned long long int *p     unsigned long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_system(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicOr(long long int *p, long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_block(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_system(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicOr(unsigned long long int *p     unsigned long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_system(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_block(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_system(int *p, int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicXor(long long int *p, long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_block(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_system(long long *p, long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_block(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_system(unsigned int *p, unsigned int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicXor(unsigned long long int *p     unsigned long long int val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_block(unsigned long long *p, unsigned long long val);

 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_system(unsigned long long *p, unsigned long long val);

0_Simple__simpleAtomicIntrinsics + 0_Simple__simpleAtomicIntrinsics_nvrtc的更多相关文章

随机推荐

  1. oracle pl/sql 变量

    一.变量介绍在编写pl/sql程序时,可以定义变量和常量:在pl/sql程序中包括有:1).标量类型(scalar)2).复合类型(composite) --用于操作单条记录3).参照类型(refer ...

  2. BigDecimal的加减乘除及比较大小

    import java.math.BigDecimal; import static java.lang.System.out; public class BaseClass { public sta ...

  3. #define WIN32_LEAN_AND_MEAN

    不加载MFC所需的模块.用英语解释:Say no to MFC如果你的工程不使用MFC,就加上这句,这样一来在编译链接时,包括最后生成的一些供调试用的模块时,速度更快,容量更小.不过对于较大工程,MF ...

  4. hdu1251字典树递归算法

    统计难题 Time Limit: 4000/2000 MS (Java/Others)    Memory Limit: 131070/65535 K (Java/Others) Total Subm ...

  5. Asp.net MVC4高级编程学习笔记-视图学习第一课20171009

    首先解释下:本文只是对Asp.net MVC4高级编程这本书学习记录的学习笔记,书本内容感觉挺简单的,但学习容易忘记,因此在边看的同时边作下了笔记,可能其它朋友看的话没有情境和逻辑顺序还请谅解! 一. ...

  6. JFrame的层次结构以及背景颜色设置问题

    JFrame的层次结构: JFrame:窗体,也就是窗口的框架.默认为不可见.不透明的(可以使用isVisible和isOpaque来验证).创建窗口时,最后一步需要调用setVisible(true ...

  7. SQL中游标的用法

    游标:是用来对表从上下每行循环取值,将值连接成为字符串.例子:对 pubs 数据库的dbo.titles 表.1.取得表中的总价格:select sum(price) from dbo.titles2 ...

  8. Python自学笔记-map和reduce函数(来自廖雪峰的官网Python3)

    感觉廖雪峰的官网http://www.liaoxuefeng.com/里面的教程不错,所以学习一下,把需要复习的摘抄一下. 以下内容主要为了自己复习用,详细内容请登录廖雪峰的官网查看. Python内 ...

  9. Druid源码阅读之连接池

    概述 Druid是阿里巴巴开源的一个数据库连接池 源码地址.下面简单分析一下连接池是怎么实现的 怎么开始阅读 如果使用过Druid连接池的都只要在Spring配置中配置jdbc的时候配置Driver是 ...

  10. 使用wget做站点镜像及wget的高级用法

    本文为大家介绍 使用wget做站点镜像及wget的高级用法,供大家学习参考. # wget -r -p -np -k http://xxx.edu.cn -r 表示递归下载,会下载所有的链接,不过要注 ...