使用 Runtime API 和 Driver API 检测设备相关属性。并检测了设备之间的拓扑以及主机与设备之间的拓扑(是否支持跨设备原子操作)。

▶ 源代码:Runtime API

 #include <memory>
#include <iostream>
#include <cuda_runtime.h>
#include <helper_cuda.h>

#if CUDART_VERSION < 5000
#include <cuda.h>

// Wraps the Driver API attribute query in a template so the Runtime API
// program can read attributes that cudaDeviceProp does not expose on
// CUDA toolkits older than 5.0.
//
//   attribute        - receives the queried value
//   device_attribute - Driver API attribute identifier to query
//   device           - device passed through to cuDeviceGetAttribute
//
// On any Driver API error the function reports it on stderr and terminates
// the process with EXIT_FAILURE.
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error)
    {
        // CUresult is an enum; cast explicitly so it matches the %d conversion.
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", (int)error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}
#endif int main()
{
printf("Start.\n");
printf("\n\tCUDA Device Query (Runtime API) version (CUDART static linking)\n"); int deviceCount;
cudaError_t error_id;
if ((error_id = cudaGetDeviceCount(&deviceCount)) != cudaSuccess)
{
printf("\ncudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
printf("\nResult = Fail\n");
exit(EXIT_FAILURE);
}
printf("\nDetected %d CUDA Capable device(s)\n", deviceCount); int dev, driverVersion, runtimeVersion;
for (dev = ; dev < deviceCount; ++dev)
{
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
driverVersion / , (driverVersion % ) / , runtimeVersion / , (runtimeVersion % ) / );
printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
(float)deviceProp.totalGlobalMem / 1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
printf(" Multiprocessors: %2d, CUDA Cores/MP: %3d %d CUDA Cores\n",
deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
if (deviceProp.l2CacheSize)
printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
#else// 在CUDA 4.0 - 4.2 中,需要通过 Driver API 来访问相关属性
int memoryClock;
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
int memBusWidth;
getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
int L2CacheSize;
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
if (L2CacheSize)
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
#endif
printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
deviceProp.maxTexture1D, deviceProp.maxTexture2D[], deviceProp.maxTexture2D[],
deviceProp.maxTexture3D[], deviceProp.maxTexture3D[], deviceProp.maxTexture3D[]);
printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
deviceProp.maxTexture1DLayered[], deviceProp.maxTexture1DLayered[]);
printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
deviceProp.maxTexture2DLayered[], deviceProp.maxTexture2DLayered[], deviceProp.maxTexture2DLayered[]);
printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
printf(" Warp size: %d\n", deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
deviceProp.maxThreadsDim[], deviceProp.maxThreadsDim[], deviceProp.maxThreadsDim[]);
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
deviceProp.maxGridSize[], deviceProp.maxGridSize[], deviceProp.maxGridSize[]);
printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ?
"TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID); const char *sComputeMode[] =
{
"Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
"Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
"Prohibited (no host thread can use ::cudaSetDevice() with this device)",
"Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
"Unknown",
NULL
};
printf(" Compute Mode: < %s >\n", sComputeMode[deviceProp.computeMode]);
} if (deviceCount >= )// 多设备情形,找出最靠前的两张支持 P2P 的设备
{
cudaDeviceProp prop[];
int gpuid[], count = , can_access_peer; for (int i = ; i < deviceCount; i++)// 在 gpuid 中记录支持 P2P 的设备编号
{
cudaGetDeviceProperties(&prop[i], i);
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)// Windows 系统需要安装 Tesla 计算集群驱动
if ((prop[i].major >= ) && prop[i].tccDriver)
#else
if ((prop[i].major >= ))
#endif
gpuid[count++] = i;
}
if (count >= )
{
for (int i = ; i < count - ; i++)
{
for (int j = i + ; j < count; j++)
{
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",
prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], can_access_peer ? "Yes" : "No");
}
}
}
} // 设备环境总况
printf("\n");
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
char cTemp[]; sProfileString += ", NumDevs = ";// 设备数
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, , "%d", deviceCount);
#else
sprintf(cTemp, "%d", deviceCount);
#endif
sProfileString += cTemp; sProfileString += ", CUDA Driver Version = ";// Driver 版本
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, , "%d.%d", driverVersion / , (driverVersion % ) / );
#else
sprintf(cTemp, "%d.%d", driverVersion / , (driverVersion % ) / );
#endif
sProfileString += cTemp; sProfileString += ", CUDA Runtime Version = ";// Runtime 版本
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, , "%d.%d", runtimeVersion / , (runtimeVersion % ) / );
#else
sprintf(cTemp, "%d.%d", runtimeVersion / , (runtimeVersion % ) / );
#endif
sProfileString += cTemp;
printf("\n%s\n", sProfileString.c_str()); printf("\nFinish: Result = Pass\n");
getchar();
return ;
}

▶ 输出结果:

Start.

    CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1070"
CUDA Driver Version / Runtime Version 9.0 / 9.0
CUDA Capability Major/Minor version number: 6.1
Total amount of global memory: MBytes ( bytes)
Multiprocessors: , CUDA Cores/MP: CUDA Cores
GPU Max Clock rate: MHz (1.64 GHz)
Memory Clock rate: Mhz
Memory Bus Width: -bit
L2 Cache Size: bytes
Maximum Texture Dimension Size (x,y,z) 1D=(), 2D=(, ), 3D=(, , )
Maximum Layered 1D Texture Size, (num) layers 1D=(), layers
Maximum Layered 2D Texture Size, (num) layers 2D=(, ), layers
Total amount of constant memory: bytes
Total amount of shared memory per block: bytes
Total number of registers available per block:
Warp size:
Maximum number of threads per multiprocessor:
Maximum number of threads per block:
Max dimension size of a thread block (x,y,z): (, , )
Max dimension size of a grid size (x,y,z): (, , )
Maximum memory pitch: bytes
Texture alignment: bytes
Concurrent copy and kernel execution: Yes with copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: / /
Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, NumDevs = 1, CUDA Driver Version = 9.0, CUDA Runtime Version = 9.0

Finish: Result = Pass

▶ 源代码:Driver API

 #include <stdio.h>
#include <cuda.h>
#include <helper_cuda_drvapi.h>

// Driver API device query.
//
// Initialises the Driver API, enumerates every device, prints its attributes
// one by one via cuDeviceGetAttribute (through the getCudaAttribute helper),
// and lists peer-to-peer accessibility for every pair of eligible devices
// when at least two devices are present.
//
// Returns 0 on success; exits with EXIT_FAILURE when initialisation or one of
// the explicitly checked queries fails. getCudaAttribute, getCudaDrvErrorString,
// _ConvertSMVer2CoresDRV and checkCudaErrors are expected to come from
// helper_cuda_drvapi.h (not visible here - confirm against the samples headers).
int main(int argc, char **argv)
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    printf("Start.\n");
    printf("CUDA Device Query (Driver API) version (CUDART static linking)\n");

    CUresult error_id;
    if ((error_id = cuInit(0)) != CUDA_SUCCESS)
    {
        printf("\ncuInit(0) returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
        printf("\nResult = Fail\n");
        exit(EXIT_FAILURE);
    }
    int deviceCount = 0;
    if ((error_id = cuDeviceGetCount(&deviceCount)) != CUDA_SUCCESS)
    {
        printf("\ncuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
        printf("\nResult = FAIL\n");
        exit(EXIT_FAILURE);
    }
    printf("\nDetected %d CUDA Capable device(s)\n", deviceCount);

    // NOTE(review): the device ordinal is used directly as the CUdevice handle
    // here, as in the original code; cuDeviceGet() is the formal way to map
    // an ordinal to a handle - confirm if portability matters.
    for (CUdevice dev = 0; dev < deviceCount; ++dev)
    {
        char deviceName[256];
        if ((error_id = cuDeviceGetName(deviceName, (int)sizeof(deviceName), dev)) != CUDA_SUCCESS)
        {
            printf("\ncuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
            printf("\nResult = FAIL\n");
            exit(EXIT_FAILURE);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);

        int driverVersion = 0;
        checkCudaErrors(cuDriverGetVersion(&driverVersion));
        // The version is encoded as 1000 * major + 10 * minor.
        printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10);

        int major, minor;
        if ((error_id = cuDeviceComputeCapability(&major, &minor, dev)) != CUDA_SUCCESS)
        {
            printf("\ncuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
            printf("\nResult = FAIL\n");
            exit(EXIT_FAILURE);
        }
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);

        size_t totalGlobalMem;
        if ((error_id = cuDeviceTotalMem(&totalGlobalMem, dev)) != CUDA_SUCCESS)
        {
            printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
            printf("Result = FAIL\n");
            exit(EXIT_FAILURE);
        }
        printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n",
               (float)totalGlobalMem / 1048576.0f, (unsigned long long)totalGlobalMem);

        int multiProcessorCount;
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
        const int coresPerSM = _ConvertSMVer2CoresDRV(major, minor);
        printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               multiProcessorCount, coresPerSM, coresPerSM * multiProcessorCount);

        int clockRate;
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f);

        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);

        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);

        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize)
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);

        int maxTex1D, maxTex2D[2], maxTex3D[3];
        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
        printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) 3D=(%d, %d, %d)\n",
               maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]);

        int maxTex1DLayered[2];
        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", maxTex1DLayered[0], maxTex1DLayered[1]);

        int maxTex2DLayered[3];
        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
               maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);

        int totalConstantMemory;
        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
        printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);

        int sharedMemPerBlock;
        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
        printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);

        int regsPerBlock;
        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
        printf(" Total number of registers available per block: %d\n", regsPerBlock);

        int warpSize;
        getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
        printf(" Warp size: %d\n", warpSize);

        int maxThreadsPerMultiProcessor;
        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
        printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);

        int maxThreadsPerBlock;
        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);

        int blockDim[3];
        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);

        int gridDim[3];
        getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);

        int textureAlign;
        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
        printf(" Texture alignment: %u bytes\n", textureAlign);

        int memPitch;
        getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
        printf(" Maximum memory pitch: %u bytes\n", memPitch);

        int gpuOverlap;
        getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
        int asyncEngineCount;
        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount);

        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
        printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");

        int integrated;
        getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
        printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");

        int canMapHostMemory;
        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
        printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");

        int concurrentKernels;
        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
        printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");

        int surfaceAlignment;
        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
        printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");

        int eccEnabled;
        getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
        printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        int tccDriver;
        getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", tccDriver ?
               "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif

        int unifiedAddressing;
        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
        printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");

        int cooperativeLaunch;
        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
        printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");

        int cooperativeMultiDevLaunch;
        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");

        int pciDomainID, pciBusID, pciDeviceID;
        getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
        getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
        getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
        // Any value outside the named modes is reported as "Unknown" instead
        // of indexing past the table.
        const int unknownMode = 4;
        if (computeMode < 0 || computeMode > unknownMode)
            computeMode = unknownMode;
        printf(" Compute Mode: < %s >\n", sComputeMode[computeMode]);
    }

    // Multi-device case: report peer access for every pair of eligible devices.
    if (deviceCount >= 2)
    {
        const int kMaxDevices = 64;
        // Never index past the fixed-size table below.
        const int numDevices = deviceCount < kMaxDevices ? deviceCount : kMaxDevices;
        int gpuid[kMaxDevices], count = 0, major, minor, can_access_peer = 0;
        for (int i = 0; i < numDevices; i++)
        {
            checkCudaErrors(cuDeviceComputeCapability(&major, &minor, i));
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
            // On Windows the Tesla Compute Cluster driver must also be in use.
            int tccDriver = 0;
            getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
            const bool p2pCapable = (major >= 2) && tccDriver;
#else
            const bool p2pCapable = (major >= 2);
#endif
            if (p2pCapable)
                gpuid[count++] = i;
        }
        if (count >= 2)
        {
            char deviceName0[256], deviceName1[256];
            for (int i = 0; i < count - 1; i++)
            {
                for (int j = i + 1; j < count; j++)
                {
                    checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    checkCudaErrors(cuDeviceGetName(deviceName0, (int)sizeof(deviceName0), gpuid[i]));
                    checkCudaErrors(cuDeviceGetName(deviceName1, (int)sizeof(deviceName1), gpuid[j]));
                    printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",
                           deviceName0, gpuid[i], deviceName1, gpuid[j], can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    printf("\nFinish: Result = Pass\n");
    getchar(); // wait for a key press before returning
    return 0;
}

▶ 输出结果:

Start.
CUDA Device Query (Driver API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1070"
CUDA Driver Version: 9.0
CUDA Capability Major/Minor version number: 6.1
Total amount of global memory: MBytes ( bytes)
() Multiprocessors, () CUDA Cores/MP: CUDA Cores
GPU Max Clock rate: MHz (1.64 GHz)
Memory Clock rate: Mhz
Memory Bus Width: -bit
L2 Cache Size: bytes
Max Texture Dimension Sizes 1D=() 2D=(, ) 3D=(, , )
Maximum Layered 1D Texture Size, (num) layers 1D=(), layers
Maximum Layered 2D Texture Size, (num) layers 2D=(, ), layers
Total amount of constant memory: bytes
Total amount of shared memory per block: bytes
Total number of registers available per block:
Warp size:
Maximum number of threads per multiprocessor:
Maximum number of threads per block:
Max dimension size of a thread block (x,y,z): (, , )
Max dimension size of a grid size (x,y,z): (, , )
Texture alignment: bytes
Maximum memory pitch: bytes
Concurrent copy and kernel execution: Yes with copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: / /
Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Finish: Result = Pass

▶ 源代码:topologyQuery

 #include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>s int main()
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
for (int device1 = ; device1 < deviceCount - ; device1++)// 设备间拓扑
{
for (int device2 = device1 + ; device2 < deviceCount; device2++)
{
int perfRank = ;
int atomicSupported = ;
int accessSupported = ;
cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2);
cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2);
if (accessSupported)
{
std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
std::cout << " * Perf Rank: " << perfRank << std::endl;
}
}
}
for (int device = ; device < deviceCount; device++)// 设备与主机间间拓扑
{
int atomicSupported;
cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device);
std::cout << "GPU" << device << " <-> CPU:" << std::endl;
std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
}
getchar();
return ;
}

▶ 输出结果:

GPU0 <-> CPU:
* Atomic Supported: no

▶ 涨姿势:

● Runtime API 比 Driver API 写起来更简单,且能直接检测的内容不少于 Driver API。

● 用到的 Runtime API 函数和 Driver API 函数。

 // cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);

// cuda_device_runtime_api.h
#define __NV_WEAK__ __declspec(nv_weak)
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)

// cuda.h
CUresult CUDAAPI cuDeviceGetCount(int *count);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);

1_Utilities__deviceQuery + 1_Utilities__deviceQueryDrv + 1_Utilities__topologyQuery的更多相关文章

随机推荐

  1. poj2387 最短路

    题意:给出一堆双向路,求从N点到1点的最短路径,最裸的最短路径,建完边之后直接跑dij或者spfa就行 dij: #include<stdio.h> #include<string. ...

  2. gevent和tornado异步

    阅读目录 从 Tornado 说起 再来看下 Gevent 总要总结一下 原文:http://www.pywave.com/2012/08/17/about-gevent-and-tornado/ 还 ...

  3. knowledge 开源知识管理系统

    knowledge 是一个不错的知识管理系统,基于markdown 我们可以方便的进行知识的标签 以及展示 使用docker-compose 运行 环境准备 docker-compose 文件 ver ...

  4. grandstack 基于graphql&&react&& apollo&& neo4j 的全栈开发工具

    grandstack是一个基于graphql&&react&& apollo&& neo4j 的全栈开发工具. 有篇关于graphql 的5个常见问题的 ...

  5. Android已有的原生Camera框架中加入自己的API的实现方案。

    版权声明:本文为CSDN博主(天才2012)原创文章.未经博主同意不得转载. https://blog.csdn.net/gzzaigcn/article/details/25707389     在 ...

  6. BAT调用7z压缩程序

    @echo offset zip=C:\Program Files\7-Zip\7z.exeset timestamp=%date:~6,4%-%date:~0,2%-%date:~3,2%set d ...

  7. JUC集合之 ArrayBlockingQueue

    ArrayBlockingQueue介绍 ArrayBlockingQueue是数组实现的线程安全的有界的阻塞队列. 线程安全是指,ArrayBlockingQueue内部通过"互斥锁&qu ...

  8. 【linux】vim/vi常用指令

    0或者"Home”键:光标转移到此段的最前面字节处. $或者"End"键:光标转移到此段的最后面字节处. n<space>:光标向后移动n个字节. n< ...

  9. 如何使用swingbench进行oracle数据库压力测试

    如何使用swingbench进行oracle数据库压力测试 2014-10-06 08:09:02 标签:oracle 数据库压力测试 swingbench 原创作品,允许转载,转载时请务必以超链接形 ...

  10. gcc gdb调试 (二)

    GDB的命令概貌——————— 启动gdb后,就你被带入gdb的调试环境中,就可以使用gdb的命令开始调试程序了,gdb的命令可以使用help命令来查看,如下所示: /home/hchen> g ...