CUDA C Programming Guide 在线教程学习笔记 Part 5

附录 A，CUDA计算设备

附录 B，C语言扩展

▶ 函数的标识符

● __device__，__global__ 和 __host__

● 宏 __CUDA_ARCH__ 可用于区分代码的运行位置.

 /*
  * Demonstrates compile-time dispatch on __CUDA_ARCH__ inside a function that
  * is compiled for both host and device. Exactly one of the branches below is
  * kept by the preprocessor in each compilation pass; the function itself
  * takes no arguments and returns nothing.
  */
 __host__ __device__ void fun()
 {
 #if __CUDA_ARCH__ >= 600
     // Device code path for compute capability 6.x and above
 #elif __CUDA_ARCH__ >= 500
     // Device code path for compute capability 5.x
 #elif __CUDA_ARCH__ >= 300
     // Device code path for compute capability 3.x
 #elif __CUDA_ARCH__ >= 200
     // Device code path for compute capability 2.x
 #elif !defined(__CUDA_ARCH__)
     // Host code path (the macro is undefined when compiling for the host)
 #endif
 }

● __noinline__ 和 __forceinline__

■ __device__ 函数由编译器判断是否转化为内联函数。

■ __noinline__ 函数要求编译器尽量不转化为内联函数。

■ __forceinline__ 函数要求编译器尽量转化为内联函数。

■ __noinline__ 和 __forceinline__ 不能共用，且不能放到 inline 函数的前面（已经内联的函数不能使用该标识符）。

▶ 变量的标识符

● __device__ 表明变量驻留在设备上，可与 __constant__ 或 __shared__ 共用，进一步表明变量的内存空间，若只有其一个标识符，则该变量满足：

■ 驻留在全局内存中。

■ 与创建该变量的 CUDA 上下文有相同的生命周期。

■ 在每台设备上有一个不同的对象。

■ 允许线程格中所有线程访问，也允许主机通过 Runtime API 访问（cudaGetSymbolAddress()，cudaGetSymbolSize()，cudaMemcpyToSymbol()，cudaMemcpyFromSymbol()）。

● __constant__ 可选与 __device__ 共用，该变量满足：

■ 驻留在常数内存空间。

■ 与创建该变量的 CUDA 上下文有相同的生命周期。

■ 在每台设备上有一个不同的对象。

■ 允许线程格中所有线程访问，也允许主机通过 Runtime API 访问（cudaGetSymbolAddress()，cudaGetSymbolSize()，cudaMemcpyToSymbol()，cudaMemcpyFromSymbol()）。

● __shared__ 可选与 __device__ 共用，该变量满足：

■ 驻留在对应线程块的共享内存空间。

■ 与对应线程块有相同的生命周期。

■ 在每个线程块上有一个不同的对象。

■ 只允许对应线程块中所有线程访问。

■ 外部共享内存数组的数据类型可以在函数内部发生变化，但要求按目标数据类型进行对齐。

 /*
  * Shows how one dynamically sized shared-memory array can be viewed through
  * pointers of other element types, and why each view must start at an offset
  * aligned for the type it is cast to.
  *
  * NOTE(review): the subscripts were lost when this snippet was extracted; the
  * values below are illustrative reconstructions -- confirm against the
  * original notes. The last line casts to float* (the extracted text had
  * short*, which can never be misaligned at a short index) so that it really
  * demonstrates the alignment error the comment describes.
  */
 __device__ void func()
 {
     extern __shared__ float array[];
     short* array0 = (short*)array;
     int*   array1 = (int*)&array0[64];      // OK: byte offset 128 is a multiple of 4
     float* array2 = (float*)&array0[32];    // OK: byte offset 64 is a multiple of 4
     float* array3 = (float*)&array0[127];   // WRONG: byte offset 254 is not a multiple of 4
 }

● __managed__ 可选与 __device__ 共用，该变量满足：

■ 可被设备和主机访问，能直接被主机或设备函数读写。

■ 具有程序生命周期。

● __restrict__ nvcc 支持的关键字。

■ 在程序员保证输入变量地址不重叠的情况下，可以提示编译器使用优化。减少内存访问次数和计算步数，但有可能增加需要的寄存器数量，造成负优化（CUDA 寄存器压力问题）。

 __device__ void func(const float* __restrict__ a, const float* __restrict__ b, float* __restrict__ c)

▶ 内建变量与内建变量类型

● 使用分量 x，y，z，w的形式将不超过四个同种类型的短变量放到一个长变量中去。

● 各种数据均使用 make_<type name>() 形式的函数来完成转化，压缩版的 vector_functions.h 定义了全部这样的函数。

 #if !defined(__VECTOR_FUNCTIONS_HPP__)

 #define __VECTOR_FUNCTIONS_HPP__

 #include "builtin_types.h"

 #include "host_defines.h"

 #include "vector_types.h"

     #if defined(__CUDACC_RTC__)

 #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__

     #else /* !__CUDACC_RTC__ */

 #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__

     #endif /* __CUDACC_RTC__ */

 /* Returns a char1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
 {
     char1 v;
     v.x = x;
     return v;
 }

 /* Returns a uchar1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
 {
     uchar1 v;
     v.x = x;
     return v;
 }

 /* Returns a char2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
 {
     char2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a uchar2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
 {
     uchar2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a char3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
 {
     char3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a uchar3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
 {
     uchar3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a char4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
 {
     char4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a uchar4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
 {
     uchar4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a short1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
 {
     short1 v;
     v.x = x;
     return v;
 }

 /* Returns a ushort1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
 {
     ushort1 v;
     v.x = x;
     return v;
 }

 /* Returns a short2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
 {
     short2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a ushort2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
 {
     ushort2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a short3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z)
 {
     short3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a ushort3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
 {
     ushort3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a short4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
 {
     short4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a ushort4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
 {
     ushort4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns an int1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
 {
     int1 v;
     v.x = x;
     return v;
 }

 /* Returns a uint1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
 {
     uint1 v;
     v.x = x;
     return v;
 }

 /* Returns an int2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
 {
     int2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a uint2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
 {
     uint2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns an int3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
 {
     int3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a uint3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
 {
     uint3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns an int4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
 {
     int4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a uint4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
 {
     uint4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a long1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
 {
     long1 v;
     v.x = x;
     return v;
 }

 /* Returns a ulong1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
 {
     ulong1 v;
     v.x = x;
     return v;
 }

 /* Returns a long2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
 {
     long2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a ulong2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
 {
     ulong2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a long3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
 {
     long3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a ulong3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
 {
     ulong3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a long4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
 {
     long4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a ulong4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
 {
     ulong4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a float1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
 {
     float1 v;
     v.x = x;
     return v;
 }

 /* Returns a float2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
 {
     float2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a float3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
 {
     float3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a float4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
 {
     float4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a longlong1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
 {
     longlong1 v;
     v.x = x;
     return v;
 }

 /* Returns a ulonglong1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
 {
     ulonglong1 v;
     v.x = x;
     return v;
 }

 /* Returns a longlong2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
 {
     longlong2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a ulonglong2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
 {
     ulonglong2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a longlong3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
 {
     longlong3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a ulonglong3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
 {
     ulonglong3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a longlong4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
 {
     longlong4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a ulonglong4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
 {
     ulonglong4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 /* Returns a double1 whose x component is the given value. */
 __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
 {
     double1 v;
     v.x = x;
     return v;
 }

 /* Returns a double2 with components (x, y). */
 __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
 {
     double2 v;
     v.x = x;
     v.y = y;
     return v;
 }

 /* Returns a double3 with components (x, y, z). */
 __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
 {
     double3 v;
     v.x = x;
     v.y = y;
     v.z = z;
     return v;
 }

 /* Returns a double4 with components (x, y, z, w). */
 __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
 {
     double4 v;
     v.x = x;
     v.y = y;
     v.z = z;
     v.w = w;
     return v;
 }

 #undef __VECTOR_FUNCTIONS_DECL__

 #endif /* !__VECTOR_FUNCTIONS_HPP__ */

● dim3 类型，定义于 vector_types.h，用于声明线程格和线程块尺寸。基于 uint3 类型，加入了一个初始化函数。

 /*
  * Three-component unsigned extent (x, y, z). In C++ it adds constructors and
  * a conversion to uint3; in plain C it is just the three fields.
  */
 struct __device_builtin__ dim3
 {
     unsigned int x, y, z;
 #if defined(__cplusplus)
     /* Every omitted dimension defaults to 1, so dim3(n) means an n x 1 x 1 extent. */
     __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
     /* Copies the three components of a uint3. */
     __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
     /* Converts back to a uint3 holding the same three components. */
     __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
 #endif /* __cplusplus */
 };

● gridDim 和 blockDim，基于 dim3 类型，指明线程格和线程块尺寸。

● blockIdx 和 threadIdx，基于uint3 类型，指明线程块和线程编号。

● warpSize，int 类型，指明一个线程束中的线程数量。

▶ 内存栅栏函数

● CUDA 编程默认为弱有序模式，也就是说 CUDA 中的线程往共享内存、全局内存、页锁定内存或另一台设备的内存中的写入并不完全按照 CUDA 设备或主机代码的顺序进行。举例下面的代码，先初始化两个全局变量 X 和 Y，分别使用线程 1 和 2 运行两个设备函数，观察结果。在强有序模式下，线程 2 结果可能为：① A == 1，B == 2（完全抢先），② A == 10，B == 2（部分抢先），③ A ==10，B ==20（完全落后）。但是在弱有序模式中，还有可能是 ④ A == 1，B == 20（看起来 B 初始化在 A 之前），需要使用内存栅栏函数来迫使内存读写按照顺序进行。

 /*
  * Weak-ordering example: two device globals start at 1 and 2; one thread
  * overwrites them with 10 and 20 while another thread reads both. No fence is
  * used here on purpose -- the snippet exists to show which (A, B) pairs a
  * reader may observe without one.
  */
 __device__ volatile int X = 1, Y = 2;

 /* Writer: stores the new value of X first, then the new value of Y. */
 __device__ void writeXY()
 {
     X = 10;
     Y = 20;
 }

 /* Reader: loads X into A first, then Y into B. */
 __device__ void readXY()
 {
     int A = X;
     int B = Y;
 }

● __threadfence_block() 。

■ 在调用该函数以前，主调线程的所有内存写入操作都要被它所在线程块内所有线程确认，接着主调线程才能调用该函数，然后主调线程继续之后的所有内存写入操作。原文：All writes to all memory made by the calling thread before the call to __threadfence_block() are observed by all threads in the block of the calling thread as occurring before all writes to all memory made by the calling thread after the call to __threadfence_block().

■ 在调用该函数以前，主调线程的所有内存读取操作都是有序的，接着主调线程才能调用该函数，然后主调线程继续之后的所有内存读取操作。原文：All reads from all memory made by the calling thread before the call to __threadfence_block() are ordered before all reads from all memory made by the calling thread after the call to __threadfence_block().

● __threadfence() 。与 __threadfence_block() 类似，注意内存栅栏函数只对一个线程的读写操作有效，为了保证其他线程对被操作数据的可视性（防止因缓存而读到尚未来得及更新的数据），应该对被操作数据加上关键字 volatile 。原文：acts as __threadfence_block() for all threads in the block of the calling thread and also ensures that no writes to all memory made by the calling thread after the call to __threadfence() are observed by any thread in the device as occurring before any write to all memory made by the calling thread before the call to __threadfence(). Note that for this ordering guarantee to be true, the observing threads must truly observe the memory and not cached versions of it; this is ensured by using the volatile keyword as detailed in Volatile Qualifier.

● __threadfence_system() 。与 __threadfence_block 类似，但是确认的主体加上主机线程和其它设备上的所有线程。原文：acts as __threadfence_block() for all threads in the block of the calling thread and also ensures that all writes to all memory made by the calling thread before the call to __threadfence_system() are observed by all threads in the device, host threads, and all threads in peer devices as occurring before all writes to all memory made by the calling thread after the call to __threadfence_system().

● 在上面的例子中，通过分别在两个函数的两个语句中间加上内存栅栏函数，就能消灭情况 ④ 的发生。区别在于，如果线程 1 和线程 2 在同一个线程块中，则函数 __threadfence_block() 就可以了；如果线程 1 和线程 2 在同一台设备的不同线程块中，则需要使用函数 __threadfence()；如果线程 1 和线程 2 在不同的设备中，则只能使用函数 __threadfence_system() 。

 //  device_functions.hpp

 // Implements __threadfence_block() as a single call to the
 // __nvvm_membar_cta() intrinsic; takes no arguments and returns nothing.
 // Per the guide text quoted in these notes, the fence orders the calling
 // thread's memory accesses as observed by the threads of its own block.
 __DEVICE_FUNCTIONS_STATIC_DECL__ void __threadfence_block()

 {

   __nvvm_membar_cta();

 }

 // Implements __threadfence() as a single call to the __nvvm_membar_gl()
 // intrinsic; takes no arguments and returns nothing. Per the guide text
 // quoted in these notes, the ordering guarantee extends to every thread
 // in the device.
 __DEVICE_FUNCTIONS_STATIC_DECL__ void __threadfence()

 {

   __nvvm_membar_gl();

 }

 // Implements __threadfence_system() as a single call to the
 // __nvvm_membar_sys() intrinsic; takes no arguments and returns nothing.
 // Per the guide text quoted in these notes, the ordering guarantee extends
 // to device threads, host threads and threads in peer devices.
 __DEVICE_FUNCTIONS_STATIC_DECL__ void __threadfence_system()

 {

   __nvvm_membar_sys();

 }

● 教程中举了一个使用内存栅栏函数的例子。使用多线程块对一维数组作归约求和，首先将数组分段到各线程块中作分段求和，然后各线程块把自己的部分和写入全局内存数组 result 的对应位置，同时用原子操作维护一个整型计数变量来记录已经完成任务的线程块数目，最后一个完成的线程块读取计数发现其等于 gridDim.x - 1，从而读取所有部分和完成最后的加法，然后输出结果。问题在于，如果不在写入部分和与维护计数变量之间插入内存栅栏函数，则有可能计数变量已经等于 gridDim.x - 1 但某些线程块的部分和还没有真正写入全局内存，此时最后一个线程块读取到尚未更新的部分和就进行加法和输出，导致错误。

 // Device-wide counter advanced with atomicInc() by the sum kernel below;
 // it must start at zero.
 __device__ unsigned int count = 0;

 // Per-block flag written by thread 0 of the sum kernel and read by the whole
 // block after __syncthreads().
 __shared__ bool isLastBlockDone;

 /*
  * Sums an array of N floats across all blocks of the grid in one launch.
  *
  * array  - input values.
  * N      - number of input values.
  * result - volatile array; result[blockIdx.x] receives each block's partial
  *          sum and, once the last block has finished, result[0] receives
  *          the total.
  *
  * Relies on calculatePartialSum() and calculateTotalSum(), which are defined
  * elsewhere, and on the globals count and isLastBlockDone.
  */
 __global__ void sum(const float* array, unsigned int N, volatile float* result)
 {
     // Each block sums a subset of the input array.
     float partialSum = calculatePartialSum(array, N);

     if (threadIdx.x == 0)
     {
         // Thread 0 of each block publishes the block's partial sum.
         // result is volatile, so (per the original note) the store does not
         // go through the L1 cache and is visible to other blocks.
         result[blockIdx.x] = partialSum;

         // The fence keeps the store above ordered before the counter update
         // below, as seen by every other thread in the device.
         __threadfence();

         // Take a ticket; atomicInc returns the value held before the increment.
         unsigned int value = atomicInc(&count, gridDim.x);

         // The block that draws ticket gridDim.x - 1 is the last one to finish.
         isLastBlockDone = (value == (gridDim.x - 1));
     }

     // Make the flag written by thread 0 visible to every thread of the block.
     __syncthreads();

     // Only the last block adds up the partial sums; its thread 0 stores the
     // total and resets count.
     if (isLastBlockDone)
     {
         float totalSum = calculateTotalSum(result);

         if (threadIdx.x == 0)
         {
             result[0] = totalSum;
             count = 0;
         }
     }
 }

▶ 同步函数

● __syncthreads() 等待同一线程块内所有线程都达到该函数位置，且所有全局和共享内存对该线程块内所有线程都可见（缓存与内存已经同步）。以下三个扩展函数都具有这两项同步功能，并且附加了求值统计的功能。

●__syncthreads_count(int predicate) 同步，且返回参与同步的线程中，变量 predicate 非零的线程个数。

●__syncthreads_and(int predicate) 同步，且返回参与同步的线程中，所有变量 predicate 的逻辑且。

●__syncthreads_or(int predicate) 同步，且返回参与同步的线程中，所有变量 predicate 的逻辑或。

● __syncwarp(unsigned mask=0xffffffff) 正在执行的线程等待，直到有相同掩码的线程束通道都执行了该函数，然后各线程再继续往下执行。所有在掩码中标明的未退出的线程，也都必须使用相同的掩码执行相应的 __syncwarp()，否则结果是未定义的。执行 __syncwarp() 保证了参与栅栏同步的线程之间的内存顺序。因此，同一线程束内所有希望通过内存进行通信的线程，可以先写入到内存，再执行 __syncwarp()，然后读取线程束中其他线程写入的值。(?) 对于 .target sm_6x 或更低的目标版本，掩码中的所有线程必须执行 __syncwarp()，且掩码中所有值的并集必须与活动掩码相等。否则，行为是未定义的。

 // device_functions.h

 __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void __syncthreads(void);

 // device_functions.hpp

 // Forwards predicate to the __nvvm_bar0_popc() intrinsic and returns its
 // result unchanged.
 __DEVICE_FUNCTIONS_STATIC_DECL__ int __syncthreads_count(int predicate)
 {
     const int tally = __nvvm_bar0_popc(predicate);
     return tally;
 }

 // Forwards predicate to the __nvvm_bar0_and() intrinsic and returns its
 // result unchanged.
 __DEVICE_FUNCTIONS_STATIC_DECL__ int __syncthreads_and(int predicate)
 {
     const int all_set = __nvvm_bar0_and(predicate);
     return all_set;
 }

 // Forwards predicate to the __nvvm_bar0_or() intrinsic and returns its
 // result unchanged.
 __DEVICE_FUNCTIONS_STATIC_DECL__ int __syncthreads_or(int predicate)
 {
     const int any_set = __nvvm_bar0_or(predicate);
     return any_set;
 }

CUDA C Programming Guide 在线教程学习笔记 Part 5的更多相关文章

CUDA C Programming Guide 在线教程学习笔记 Part 4
▶ 图形互操作性,OpenGL 与 Direct3D 相关.(没学过,等待填坑) ▶ 版本号与计算能力 ● 计算能力(Compute Capability)表征了硬件规格,CUDA版本号表征了驱动接口 ...
CUDA C Programming Guide 在线教程学习笔记 Part 2
▶ 纹理内存使用 ● 纹理内存使用有两套 API,称为 Object API 和 Reference API .纹理对象(texture object)在运行时被 Object API 创建,同时指定 ...
CUDA C Programming Guide 在线教程学习笔记 Part 10【坑】
▶ 动态并行. ● 动态并行直接从 GPU 上创建工作,可以减少主机和设备间数据传输,在设备线程中调整配置.有数据依赖的并行工作可以在内核运行时生成,并利用 GPU 的硬件调度和负载均衡.动态并行要求 ...
CUDA C Programming Guide 在线教程学习笔记 Part 13
▶ 纹理内存访问补充(见纹理内存博客 http://www.cnblogs.com/cuancuancuanhao/p/7809713.html) ▶ 计算能力 ● 不同计算能力的硬件对计算特性的支持 ...
CUDA C Programming Guide 在线教程学习笔记 Part 9
▶ 协作组,要求 cuda ≥ 9.0,一个简单的例子见 http://www.cnblogs.com/cuancuancuanhao/p/7881093.html ● 灵活调节需要进行通讯的线程组合 ...
CUDA C Programming Guide 在线教程学习笔记 Part 8
▶ 线程束表决函数(Warp Vote Functions) ● 用于同一线程束内各线程通信和计算规约指标. // device_functions.h,cc < 9.0 __DEVICE_FU ...
CUDA C Programming Guide 在线教程学习笔记 Part 7
▶ 可缓存只读操作(Read-Only Data Cache Load Function),定义在 sm_32_intrinsics.hpp 中.从地址 adress 读取类型为 T 的函数返回,T ...
CUDA C Programming Guide 在线教程学习笔记 Part 3
▶ 表面内存使用 ● 创建 cuda 数组时使用标志 cudaArraySurfaceLoadStore 来创建表面内存,可以用表面对象(surface object)或表面引用(surface re ...
CUDA C Programming Guide 在线教程学习笔记 Part 1
1. 简介 2. 编程模型 ▶ SM version 指的是硬件构架和特性,CUDA version 指的是软件平台版本. 3. 编程接口.参考 http://chenrudan.github.io/ ...

随机推荐

M端错误提醒 -- pop 使用
JS: window.pop = {/*alert提示框 *@param title 提示的标题 *@param desc 提示的描述 *@param btnNum 按钮的数量,假如为1,则无视e2, ...
linux学习——sed工具
命令格式: sed [-nefr] [动作] 1.sed可以分析标准输入(STDIN)的数据,然后将数据处理后,再将他输出到标准输出(STDOUT),他有替换.删除.新增.选定特定行等处理功能.sed ...
Stones 优先队列
Because of the wrong status of the bicycle, Sempr begin to walk east to west every morning and walk ...
哈尔滨理工大学第七届程序设计竞赛初赛（BFS多队列顺序）
哈尔滨理工大学第七届程序设计竞赛初赛https://www.nowcoder.com/acm/contest/28#question D题wa了半天....(真真正正的半天) 其实D题本来就是一个简单 ...
CH4302 Interval GCD
题意 4302 Interval GCD 0x40「数据结构进阶」例题描述给定一个长度为N的数列A,以及M条指令 (N≤5*10^5, M<=10^5),每条指令可能是以下两种之一: &qu ...
day2-Iptables笔记
1. iptables防火墙简介 Iptables也叫netfilter是Linux下自带的一款免费且优秀的基于包过滤的防火墙工具,它的功能十分强大,使用非常灵活,可以对流入.流出.流经服务器的数 ...
struts2访问ServletAPI方式和获取参数的方式
一.访问ServletAPI的三种方式方式1:通过让Action类去实现感知接口. 此时项目依赖:servlet-api.jar. ServletRequestAware:感知HttpServlet ...
Eclipse使用前准备（转）
Eclipse的发布流程 M1 08/19/2009 M2 09/30/2009 M3 11/11/2009 M4 12/16/2009 M ...
MySQL--禁用账号和设置账号有效期
======================================================================= MySQL5.5/5.6版本在MySQL 5.7 版本 ...
ubuntu 14.04安装OVS虚拟OpenFlow交换机配置总结
一.安装OVS sudo apt-get install openvswitch-controller openvswitch-switch openvswitch-datapath-source ( ...

CUDA C Programming Guide 在线教程学习笔记 Part 5

CUDA C Programming Guide 在线教程学习笔记 Part 5的更多相关文章

随机推荐

热门专题