1_Utilities__deviceQuery + 1_Utilities__deviceQueryDrv + 1_Utilities__topologyQuery

使用 Runtime API 和 Driver API 检测设备相关属性。并检测了设备之间的拓扑以及主机与设备之间的拓扑（是否支持跨设备原子操作）。

▶ 源代码：Runtime API

 #include <memory>

 #include <iostream>

 #include <cuda_runtime.h>

 #include <helper_cuda.h>

 #if CUDART_VERSION < 5000

 #include <cuda.h>

 template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)// 将 Driver API 的获取属性函数放到模板中

 {

     CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

     if (CUDA_SUCCESS != error)

     {

         fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);

         exit(EXIT_FAILURE);

     }

 }

 #endif

 int main()

 {

     printf("Start.\n");

     printf("\n\tCUDA Device Query (Runtime API) version (CUDART static linking)\n");

     int deviceCount;

     cudaError_t error_id;

     if ((error_id = cudaGetDeviceCount(&deviceCount)) != cudaSuccess)

     {

         printf("\ncudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));

         printf("\nResult = Fail\n");

         exit(EXIT_FAILURE);

     }

     printf("\nDetected %d CUDA Capable device(s)\n", deviceCount);

     int dev, driverVersion, runtimeVersion;

     for (dev = ; dev < deviceCount; ++dev)

     {

         cudaSetDevice(dev);

         cudaDeviceProp deviceProp;

         cudaGetDeviceProperties(&deviceProp, dev);

         printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

         cudaDriverGetVersion(&driverVersion);

         cudaRuntimeGetVersion(&runtimeVersion);

         printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",

             driverVersion / , (driverVersion % ) / , runtimeVersion / , (runtimeVersion % ) / );

         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",

             (float)deviceProp.totalGlobalMem / 1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);

         printf("  Multiprocessors: %2d, CUDA Cores/MP: %3d        %d CUDA Cores\n",

             deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),

             _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

 #if CUDART_VERSION >= 5000

         printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);

         printf("  Memory Bus Width:                              %d-bit\n", deviceProp.memoryBusWidth);

         if (deviceProp.l2CacheSize)

             printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);

 #else// 在CUDA 4.0 - 4.2 中，需要通过 Driver API 来访问相关属性

         int memoryClock;

         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);

         printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);

         int memBusWidth;

         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);

         printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);

         int L2CacheSize;

         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

         if (L2CacheSize)

             printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);

 #endif

         printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",

             deviceProp.maxTexture1D, deviceProp.maxTexture2D[], deviceProp.maxTexture2D[],

             deviceProp.maxTexture3D[], deviceProp.maxTexture3D[], deviceProp.maxTexture3D[]);

         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",

             deviceProp.maxTexture1DLayered[], deviceProp.maxTexture1DLayered[]);

         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",

             deviceProp.maxTexture2DLayered[], deviceProp.maxTexture2DLayered[], deviceProp.maxTexture2DLayered[]);

         printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);

         printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);

         printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);

         printf("  Warp size:                                     %d\n", deviceProp.warpSize);

         printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);

         printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);

         printf("  Max dimension size of a thread block (x,y,z):  (%d, %d, %d)\n",

             deviceProp.maxThreadsDim[], deviceProp.maxThreadsDim[], deviceProp.maxThreadsDim[]);

         printf("  Max dimension size of a grid size    (x,y,z):  (%d, %d, %d)\n",

             deviceProp.maxGridSize[], deviceProp.maxGridSize[], deviceProp.maxGridSize[]);

         printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);

         printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);

         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);

         printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");

         printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");

         printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");

         printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");

         printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ?

             "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");

 #endif

         printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");

         printf("  Supports Cooperative Kernel Launch:            %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");

         printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");

         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

         const char *sComputeMode[] =

         {

             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",

             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",

             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",

             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",

             "Unknown",

             NULL

         };

         printf("  Compute Mode: < %s >\n", sComputeMode[deviceProp.computeMode]);

     }

     if (deviceCount >= )// 多设备情形，找出最靠前的两张支持 P2P 的设备

     {

         cudaDeviceProp prop[];

         int gpuid[], count = , can_access_peer;

         for (int i = ; i < deviceCount; i++)// 在 gpuid 中记录支持 P2P 的设备编号

         {

             cudaGetDeviceProperties(&prop[i], i);

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)// Windows 系统需要安装 Tesla 计算集群驱动

             if ((prop[i].major >= ) && prop[i].tccDriver)

 #else

             if ((prop[i].major >= ))

 #endif

                 gpuid[count++] = i;

         }

         if (count >= )

         {

             for (int i = ; i < count - ; i++)

             {

                 for (int j = i + ; j < count; j++)

                 {

                     cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

                     printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",

                         prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], can_access_peer ? "Yes" : "No");

                 }

             }

         }

     }

     // 设备环境总况

     printf("\n");

     std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";

     char cTemp[];

     sProfileString += ", NumDevs = ";// 设备数

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

     sprintf_s(cTemp, , "%d", deviceCount);

 #else

     sprintf(cTemp, "%d", deviceCount);

 #endif

     sProfileString += cTemp;

     sProfileString += ", CUDA Driver Version = ";// Driver 版本

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

     sprintf_s(cTemp, , "%d.%d", driverVersion / , (driverVersion % ) / );

 #else

     sprintf(cTemp, "%d.%d", driverVersion / , (driverVersion % ) / );

 #endif

     sProfileString += cTemp;

     sProfileString += ", CUDA Runtime Version = ";// Runtime 版本

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

     sprintf_s(cTemp, , "%d.%d", runtimeVersion / , (runtimeVersion % ) / );

 #else

     sprintf(cTemp, "%d.%d", runtimeVersion / , (runtimeVersion % ) / );

 #endif

     sProfileString += cTemp;

     printf("\n%s\n", sProfileString.c_str());

     printf("\nFinish: Result = Pass\n");

     getchar();

     return ;

 }

▶ 输出结果：

Start.

    CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1070"

  CUDA Driver Version / Runtime Version          9.0 / 9.0

  CUDA Capability Major/Minor version number:    6.1

  Total amount of global memory:                  MBytes ( bytes)

  Multiprocessors: , CUDA Cores/MP:          CUDA Cores

  GPU Max Clock rate:                             MHz (1.64 GHz)

  Memory Clock rate:                              Mhz

  Memory Bus Width:                              -bit

  L2 Cache Size:                                  bytes

  Maximum Texture Dimension Size (x,y,z)         1D=(), 2D=(, ), 3D=(, , )

  Maximum Layered 1D Texture Size, (num) layers  1D=(),  layers

  Maximum Layered 2D Texture Size, (num) layers  2D=(, ),  layers

  Total amount of constant memory:                bytes

  Total amount of shared memory per block:        bytes

  Total number of registers available per block:

  Warp size:

  Maximum number of threads per multiprocessor:

  Maximum number of threads per block:

  Max dimension size of a thread block (x,y,z):  (, , )

  Max dimension size of a grid size    (x,y,z):  (, , )

  Maximum memory pitch:                           bytes

  Texture alignment:                              bytes

  Concurrent copy and kernel execution:          Yes with  copy engine(s)

  Run time limit on kernels:                     Yes

  Integrated GPU sharing Host Memory:            No

  Support host page-locked memory mapping:       Yes

  Alignment requirement for Surfaces:            Yes

  Device has ECC support:                        Disabled

  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)

  Device supports Unified Addressing (UVA):      Yes

  Supports Cooperative Kernel Launch:            No

  Supports MultiDevice Co-op Kernel Launch:      No

  Device PCI Domain ID / Bus ID / location ID:    /  /

  Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, NumDevs = 1, CUDA Driver Version = 9.0, CUDA Runtime Version = 9.0

Finish: Result = Pass

▶ 源代码：Driver API

 #include <stdio.h>

 #include <cuda.h>

 #include <helper_cuda_drvapi.h>

 int main(int argc, char **argv)

 {

     printf("Start.\n");

     printf("CUDA Device Query (Driver API) version (CUDART static linking)\n");

     CUresult error_id;

     if ((error_id = cuInit()) != CUDA_SUCCESS)

     {

         printf("\ncuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id));

         printf("\nResult = Fail\n");

         exit(EXIT_FAILURE);

     }

     int deviceCount = ;

     if ((error_id = cuDeviceGetCount(&deviceCount)) != CUDA_SUCCESS)

     {

         printf("\ncuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));

         printf("\nResult = FAIL\n");

         exit(EXIT_FAILURE);

     }

     printf("\nDetected %d CUDA Capable device(s)\n", deviceCount);

     for (CUdevice dev = ; dev < deviceCount; ++dev)

     {

         char deviceName[];

         if ((error_id = cuDeviceGetName(deviceName, , dev)) != CUDA_SUCCESS)

         {

             printf("\ncuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));

             printf("\nResult = FAIL\n");

             exit(EXIT_FAILURE);

         }

         printf("\nDevice %d: \"%s\"\n", dev, deviceName);

         int driverVersion;

         cuDriverGetVersion(&driverVersion);

         printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/, (driverVersion%)/);

         int major, minor;

         if ((error_id = cuDeviceComputeCapability(&major, &minor, dev)) != CUDA_SUCCESS)

         {

             printf("\ncuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));

             printf("\nResult = FAIL\n");

             exit(EXIT_FAILURE);

         }

         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);

         size_t totalGlobalMem;

         if ((error_id = cuDeviceTotalMem(&totalGlobalMem, dev)) != CUDA_SUCCESS)

         {

             printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));

             printf("Result = FAIL\n");

             exit(EXIT_FAILURE);

         }

         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",

             (float)totalGlobalMem / 1048576.0f, (unsigned long long) totalGlobalMem);

         int multiProcessorCount;

         getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

         printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",

             multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor), _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);

         int clockRate;

         getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);

         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f);

         int memoryClock;

         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);

         printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);

         int memBusWidth;

         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);

         printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);

         int L2CacheSize;

         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

         if (L2CacheSize)

             printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);

         int maxTex1D, maxTex2D[], maxTex3D[];

         getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);

         getCudaAttribute<int>(&maxTex2D[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);

         getCudaAttribute<int>(&maxTex2D[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);

         getCudaAttribute<int>(&maxTex3D[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);

         getCudaAttribute<int>(&maxTex3D[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);

         getCudaAttribute<int>(&maxTex3D[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);

         printf("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) 3D=(%d, %d, %d)\n",

             maxTex1D, maxTex2D[], maxTex2D[], maxTex3D[], maxTex3D[], maxTex3D[]);

         int  maxTex1DLayered[];

         getCudaAttribute<int>(&maxTex1DLayered[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);

         getCudaAttribute<int>(&maxTex1DLayered[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);

         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n", maxTex1DLayered[], maxTex1DLayered[]);

         int  maxTex2DLayered[];

         getCudaAttribute<int>(&maxTex2DLayered[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);

         getCudaAttribute<int>(&maxTex2DLayered[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);

         getCudaAttribute<int>(&maxTex2DLayered[], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);

         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",

             maxTex2DLayered[], maxTex2DLayered[], maxTex2DLayered[]);

         int totalConstantMemory;

         getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);

         printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);

         int sharedMemPerBlock;

         getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);

         printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);

         int regsPerBlock;

         getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);

         printf("  Total number of registers available per block: %d\n", regsPerBlock);

         int warpSize;

         getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);

         printf("  Warp size:                                     %d\n", warpSize);

         int maxThreadsPerMultiProcessor;

         getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);

         printf("  Maximum number of threads per multiprocessor:  %d\n", maxThreadsPerMultiProcessor);

         int maxThreadsPerBlock;

         getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);

         printf("  Maximum number of threads per block:           %d\n", maxThreadsPerBlock);

         int blockDim[];

         getCudaAttribute<int>(&blockDim[], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);

         getCudaAttribute<int>(&blockDim[], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);

         getCudaAttribute<int>(&blockDim[], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);

         printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[], blockDim[], blockDim[]);

         int gridDim[];

         getCudaAttribute<int>(&gridDim[], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);

         getCudaAttribute<int>(&gridDim[], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);

         getCudaAttribute<int>(&gridDim[], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);

         printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n", gridDim[], gridDim[], gridDim[]);

         int textureAlign;

         getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);

         printf("  Texture alignment:                             %u bytes\n", textureAlign);

         int memPitch;

         getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);

         printf("  Maximum memory pitch:                          %u bytes\n", memPitch);

         int gpuOverlap;

         getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

         int asyncEngineCount;

         getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);

         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount);

         int kernelExecTimeoutEnabled;

         getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);

         printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");

         int integrated;

         getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);

         printf("  Integrated GPU sharing Host Memory:            %s\n", integrated ? "Yes" : "No");

         int canMapHostMemory;

         getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);

         printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");

         int concurrentKernels;

         getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);

         printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");

         int surfaceAlignment;

         getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);

         printf("  Alignment requirement for Surfaces:            %s\n", surfaceAlignment ? "Yes" : "No");

         int eccEnabled;

         getCudaAttribute<int>(&eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);

         printf("  Device has ECC support:                        %s\n", eccEnabled ? "Enabled" : "Disabled");

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

         int tccDriver ;

         getCudaAttribute<int>(&tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);

         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", tccDriver ?

             "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");

 #endif

         int unifiedAddressing;

         getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);

         printf("  Device supports Unified Addressing (UVA):      %s\n", unifiedAddressing ? "Yes" : "No");

         int cooperativeLaunch;

         getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);

         printf("  Supports Cooperative Kernel Launch:            %s\n", cooperativeLaunch ? "Yes" : "No");

         int cooperativeMultiDevLaunch;

         getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);

         printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");

         int pciDomainID, pciBusID, pciDeviceID;

         getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);

         getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);

         getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);

         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);

         const char *sComputeMode[] =

         {

             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",

             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",

             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",

             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",

             "Unknown",

             NULL

         };

         int computeMode;

         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

         printf("  Compute Mode: < %s >\n", sComputeMode[computeMode]);

     }

     if (deviceCount >= )// 多设备情形

     {

         int gpuid[], count = , major, minor, tccDriver, can_access_peer;

         for (int i = ; i < deviceCount; i++)

         {

             cuDeviceComputeCapability(&major, &minor, i);

             getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

             if ((major >= ) && tccDriver)

 #else

             if ((major >= ))

 #endif

                 gpuid[count++] = i;

         }

         if (count >= )

         {

             char deviceName0[], deviceName1[];

             for (int i = ; i < count - ; i++)

             {

                 for (int j = i + ; j < count; j++)

                 {

                     cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

                     cuDeviceGetName(deviceName0, , gpuid[i]);

                     cuDeviceGetName(deviceName1, , gpuid[j]);

                     printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",

                         deviceName0, gpuid[i], deviceName1, gpuid[j], can_access_peer ? "Yes" : "No");

                 }

             }

         }

     }

     printf("\nFinish: Result = Pass\n");

     getchar();

     return ;

 }

▶ 输出结果：

Start.

CUDA Device Query (Driver API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1070"

  CUDA Driver Version:                           9.0

  CUDA Capability Major/Minor version number:    6.1

  Total amount of global memory:                  MBytes ( bytes)

  () Multiprocessors, () CUDA Cores/MP:      CUDA Cores

  GPU Max Clock rate:                             MHz (1.64 GHz)

  Memory Clock rate:                              Mhz

  Memory Bus Width:                              -bit

  L2 Cache Size:                                  bytes

  Max Texture Dimension Sizes                    1D=() 2D=(, ) 3D=(, , )

  Maximum Layered 1D Texture Size, (num) layers  1D=(),  layers

  Maximum Layered 2D Texture Size, (num) layers  2D=(, ),  layers

  Total amount of constant memory:                bytes

  Total amount of shared memory per block:        bytes

  Total number of registers available per block:

  Warp size:

  Maximum number of threads per multiprocessor:

  Maximum number of threads per block:

  Max dimension size of a thread block (x,y,z): (, , )

  Max dimension size of a grid size (x,y,z):    (, , )

  Texture alignment:                              bytes

  Maximum memory pitch:                           bytes

  Concurrent copy and kernel execution:          Yes with  copy engine(s)

  Run time limit on kernels:                     Yes

  Integrated GPU sharing Host Memory:            No

  Support host page-locked memory mapping:       Yes

  Concurrent kernel execution:                   Yes

  Alignment requirement for Surfaces:            Yes

  Device has ECC support:                        Disabled

  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)

  Device supports Unified Addressing (UVA):      Yes

  Supports Cooperative Kernel Launch:            No

  Supports MultiDevice Co-op Kernel Launch:      No

  Device PCI Domain ID / Bus ID / location ID:    /  /

  Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Finish: Result = Pass

▶ 源代码：topologyQuery

 #include <cstdio>
 #include <cstdlib>
 #include <iostream>

 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 #include <helper_functions.h>

 int main()

 {

     int deviceCount;

     cudaGetDeviceCount(&deviceCount);

     for (int device1 = ; device1 < deviceCount - ; device1++)// 设备间拓扑

     {

         for (int device2 = device1 + ; device2 < deviceCount; device2++)

         {

             int perfRank = ;

             int atomicSupported = ;

             int accessSupported = ;

             cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2);

             cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);

             cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2);

             if (accessSupported)

             {

                 std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;

                 std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;

                 std::cout << "  * Perf Rank: " << perfRank << std::endl;

             }

         }

     }

     for (int device = ; device < deviceCount; device++)// 设备与主机间间拓扑

     {

         int atomicSupported;

         cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device);

         std::cout << "GPU" << device << " <-> CPU:" << std::endl;

         std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;

     }

     getchar();

     return ;

 }

▶ 输出结果：

GPU0 <-> CPU:

  * Atomic Supported: no

▶ 涨姿势：

● Runtime API 比 Driver API 写起来更简单，且能直接检测的内容不少于 Driver API。

● 用到的 Runtime API 函数和 Driver API 函数。

 // Declarations quoted from the CUDA headers for the API functions used by the
 // three listings above (reference only; not meant to be compiled as-is).

 // cuda_runtime_api.h

 // Writes the installed driver's version number to *driverVersion.
 extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);

 // Writes the runtime's version number to *runtimeVersion.
 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);

 // Queries one peer-to-peer attribute (attr) of the link srcDevice -> dstDevice into *value.
 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);

 // cuda_device_runtime_api.h

 #define __NV_WEAK__ __declspec(nv_weak)

 // Queries one device attribute (attr) of `device` into *value.
 // NOTE(review): this is the __device__ declaration, while the topology listing
 // calls the function from host code; the host-side declaration presumably
 // lives in cuda_runtime_api.h - confirm.
 __device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)

 // cuda.h

 // Writes the number of devices to *count.
 CUresult CUDAAPI cuDeviceGetCount(int *count);

 // Writes the compute capability of dev to *major / *minor.
 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);

 // Writes to *canAccessPeer whether dev can access the memory of peerDev.
 CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);

 // Copies the name of dev into the buffer `name`, whose capacity is `len`.
 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);

1_Utilities__deviceQuery + 1_Utilities__deviceQueryDrv + 1_Utilities__topologyQuery的更多相关文章

随机推荐

maven quick start
mvn archetype:generate -DgroupId=com.mycompany.app -DartifactId=my-app -DarchetypeArtifactId=maven-a ...
hibernate连接oracle12c数据库报：java.sql.SQLException: ORA-01017: 用户名/口令无效; 登录被拒绝。（用户名/口令在oracle客户端以及cmd命令都能登入）
报错信息: 2017-09-22 15:40:07,354 WARN [org.hibernate.cfg.SettingsFactory] - Could not obtain connection ...
（研）for循环的一个bug以及3个while循环的快排
在这个for循环中,只要有一次不满足,这个for循环将break掉 while(p->score>=90&&i<5) count++ //若有一次不满足的话,那么整个 ...
call和apply的意义和区别
区别在于 call 的第二个参数可以是任意类型,而apply的第二个参数必须是数组如 func.call(func1,var1,var2,var3)对应的apply写法为:func.apply(f ...
用vmware安装gho文件心得
在卡饭学到了不少知识,下面是我的一个心得分享,希望大家能用的上. 用vmware安装gho文件心得方法1:diskgenius+ghostexp用vm新建一个空白硬盘虚拟机, 记住虚拟机文件的存储位 ...
如何取出word文档里的图片
在生活当中,Word办公是必不可少的.但是在工作中也会遇到一些麻烦,比如说如何取出word文档里的图片呢?有的人会通过复制粘贴,通过画图保存,可是这种方法未免太繁琐了吧.下面我就来分享一下我的经验. ...
【转】linux下解压.bz2压缩文件
原文网址:http://zhidao.baidu.com/question/90378903.html tar-c: 建立压缩档案-x:解压-t:查看内容-r:向压缩归档文件末尾追加文件-u:更新原压 ...
mysql 中find_in_set()和in()用法比较
mysql 中find_in_set()和in()用法比较在mysql中in可以包括指定的数字,而find_in_set()用于特定的数据类型. find_in_set 函数使用方法个例子来说:有 ...
JS enter事件及数据不完整阻止下一步操作
阻止下一步操作: 1.return false; 2.e.preventDefault(); 但IE8不支持 //键盘事件|enter $(function () { document.onkeyd ...
【android】Android ADB 端口占用问题解决方案
解决ADB端口占用问题方式一5037为adb默认端口,若5037端口被占用,查看占用端口的进程PIDC:\Users\wwx229495>netstat -aon|findstr 5037 ...

1_Utilities__deviceQuery + 1_Utilities__deviceQueryDrv + 1_Utilities__topologyQuery

1_Utilities__deviceQuery + 1_Utilities__deviceQueryDrv + 1_Utilities__topologyQuery的更多相关文章

随机推荐

热门专题