使用三种模式(QUICK_MODE,RANGE_MODE,SHMOO_MODE),测试三种拷贝情况下的显存带宽(HostToDevice,DeviceToHost,DeviceToDevice)

▶ 源代码

 #include <iostream>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <helper_cuda.h>
#include <helper_functions.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

// Bandwidth test: measures host-to-device, device-to-host and device-to-device
// copy bandwidth in one of three modes (quick, range, shmoo).
// Bandwidth is reported in MB/s where 1 MB = 2^20 bytes.

#define MEMCOPY_ITERATIONS 100
#define DEFAULT_SIZE ( 32 * ( 1 << 20 ) )//32 M
#define DEFAULT_INCREMENT (1 << 22) //4 M
#define CACHE_CLEAR_SIZE (1 << 24) //16 M
#define SHMOO_MEMSIZE_MAX (1 << 26) //64 M
#define SHMOO_MEMSIZE_START (1 << 10) //1 KB
#define SHMOO_INCREMENT_1KB (1 << 10) //1 KB
#define SHMOO_INCREMENT_2KB (1 << 11) //2 KB
#define SHMOO_INCREMENT_10KB (10 * (1 << 10)) //10KB
#define SHMOO_INCREMENT_100KB (100 * (1 << 10)) //100 KB
#define SHMOO_INCREMENT_1MB (1 << 20) //1 MB
#define SHMOO_INCREMENT_2MB (1 << 21) //2 MB
#define SHMOO_INCREMENT_4MB (1 << 22) //4 MB
#define SHMOO_LIMIT_20KB (20 * (1 << 10)) //20 KB
#define SHMOO_LIMIT_50KB (50 * (1 << 10)) //50 KB
#define SHMOO_LIMIT_100KB (100 * (1 << 10)) //100 KB
#define SHMOO_LIMIT_1MB (1 << 20) //1 MB
#define SHMOO_LIMIT_16MB (1 << 24) //16 MB
#define SHMOO_LIMIT_32MB (1 << 25) //32 MB

// When DEBUG is defined, test() ignores the command line selection and runs all
// three copy directions in SHMOO_MODE with write-combined pinned memory.
// Remove this definition to let the command line options take effect.
#define DEBUG

// Set by --cputiming: use the CPU stopwatch even where event timing is available.
static bool bDontUseGPUTiming = false;

const char *sMemoryCopyKind[] = { "Device to Host", "Host to Device", "Device to Device", NULL };
const char *sMemoryMode[] = { "PINNED", "PAGEABLE", NULL };
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };

// Converts the time taken by MEMCOPY_ITERATIONS copies of memSize bytes into MB/s.
// elapsedMs is in milliseconds, hence the factor 1000 to get a per-second rate.
static float bandwidthInMBs(unsigned int memSize, float elapsedMs)
{
    return (1000.0f * (float)memSize * (float)MEMCOPY_ITERATIONS) / (elapsedMs * (float)(1 << 20));
}

// Allocates a host buffer of memSize bytes (page-locked or pageable) and fills it
// with the byte pattern i & 0xff. Exits the process if the allocation fails.
static unsigned char *allocHostBuffer(unsigned int memSize, memoryMode memMode, bool wc, const char *testName)
{
    unsigned char *h_data = NULL;
    if (memMode == PINNED)
    {
#if CUDART_VERSION >= 2020
        // cudaHostAlloc() can additionally request write-combined memory.
        checkCudaErrors(cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : cudaHostAllocDefault));
#else
        (void)wc;
        checkCudaErrors(cudaMallocHost((void **)&h_data, memSize));
#endif
    }
    else
    {
        h_data = (unsigned char *)malloc(memSize);
        if (h_data == NULL)
        {
            fprintf(stderr, "\nNo host memory to run %s\n", testName);
            exit(EXIT_FAILURE);
        }
    }
    for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++)
        h_data[i] = (unsigned char)(i & 0xff);
    return h_data;
}

// Releases a buffer obtained from allocHostBuffer() with the matching deallocator.
static void freeHostBuffer(unsigned char *h_data, memoryMode memMode)
{
    if (memMode == PINNED)
        checkCudaErrors(cudaFreeHost(h_data));
    else
        free(h_data);
}

// Runs MEMCOPY_ITERATIONS copies of memSize bytes from src to dst and returns the
// elapsed time in milliseconds.
//   async       - issue cudaMemcpyAsync on stream 0 instead of blocking cudaMemcpy
//   useCpuTimer - report the CPU stopwatch value instead of the CUDA event interval
static float timeCopies(void *dst, const void *src, unsigned int memSize, cudaMemcpyKind kind, bool async, bool useCpuTimer)
{
    float elapsedMs = 0.0f;
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
    {
        if (async)
        {
            checkCudaErrors(cudaMemcpyAsync(dst, src, memSize, kind, 0));
        }
        else
        {
            checkCudaErrors(cudaMemcpy(dst, src, memSize, kind));
        }
    }
    checkCudaErrors(cudaEventRecord(stop, 0));
    // Wait for all queued copies (and the stop event) before reading either timer.
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    checkCudaErrors(cudaEventElapsedTime(&elapsedMs, start, stop));
    if (useCpuTimer)
        elapsedMs = sdkGetTimerValue(&timer);

    sdkDeleteTimer(&timer);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    return elapsedMs;
}

// Prints one "size / bandwidth" row per measurement in a human readable table.
void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs)
{
    printf("\n%s Bandwidth, %i Device(s), %s Memory Transfers\n", sMemoryCopyKind[kind], iNumDevs, sMemoryMode[memMode]);
    printf(" Transfer Size: Bytes\tBandwidth: MB/s\n");
    for (unsigned int i = 0; i < count; i++)
        printf(" %u\t\t\t%.1f\n", memSizes[i], bandwidths[i]);
}

// Prints one CSV-style line per measurement; the configuration tag encodes the
// copy direction and, for host transfers, the host memory kind.
void printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc)
{
    std::string sConfig;
    if (kind == DEVICE_TO_DEVICE)
        sConfig += "-D2D";
    else
    {
        if (kind == DEVICE_TO_HOST)
            sConfig += "-D2H";
        else if (kind == HOST_TO_DEVICE)
            sConfig += "-H2D";

        if (memMode == PAGEABLE)
            sConfig += "-Paged";
        else if (memMode == PINNED)
        {
            sConfig += "-Pinned";
            if (wc)
                sConfig += "-WriteCombined";
        }
    }
    for (unsigned int i = 0; i < count; i++)
    {
        // Time in seconds for one transfer: bytes / (MB/s * bytes-per-MB).
        double seconds = (double)memSizes[i] / (bandwidths[i] * (double)(1 << 20));
        printf("BandwidthTest %s, Bandwidth = %.1f MB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n",
               sConfig.c_str(), bandwidths[i], seconds, memSizes[i], iNumDevs);
    }
}

// Prints the command line usage.
void printHelp(void)
{
    printf("Usage: bandwidthTest [OPTION]...\n");
    printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
    printf("\n");
    printf("Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
    printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");
    printf("\n");
    printf("Options:\n");
    printf("--help\tDisplay this help menu\n");
    printf("--csv\tPrint results as a CSV\n");
    printf("--device=[deviceno]\tSpecify the device device to be used\n");
    printf(" all - compute cumulative bandwidth on all the devices\n");
    printf(" 0,1,2,...,n - Specify any particular device to be used\n");
    printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
    printf(" pageable - pageable memory\n");
    printf(" pinned - non-pageable system memory\n");
    printf("--mode=[MODE]\tSpecify the mode to use\n");
    printf(" quick - performs a quick measurement\n");
    printf(" range - measures a user-specified range of values\n");
    printf(" shmoo - performs an intense shmoo of a large range of values\n");
    printf("--htod\tMeasure host to device transfers\n");
    printf("--dtoh\tMeasure device to host transfers\n");
    printf("--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
    printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
    printf("--cputiming\tForce CPU-based timing always\n");
    printf("Range mode options\n");
    printf("--start=[SIZE]\tStarting transfer size in bytes\n");
    printf("--end=[SIZE]\tEnding transfer size in bytes\n");
    printf("--increment=[SIZE]\tIncrement size in bytes\n");
    return;
}

// Measures device-to-host copy bandwidth (MB/s) for a transfer of memSize bytes.
// Pinned host memory uses asynchronous copies timed with CUDA events; pageable
// memory uses blocking copies timed with the CPU stopwatch.
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
    unsigned char *h_data = allocHostBuffer(memSize, memMode, wc, "testDeviceToHostTransfer");
    unsigned char *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, memSize));
    // Put known data on the device first; the timed part copies it back.
    checkCudaErrors(cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice));

    float elapsedTime = timeCopies(h_data, d_data, memSize, cudaMemcpyDeviceToHost,
                                   memMode == PINNED, memMode != PINNED || bDontUseGPUTiming);

    freeHostBuffer(h_data, memMode);
    checkCudaErrors(cudaFree(d_data));
    return bandwidthInMBs(memSize, elapsedTime);
}

// Measures host-to-device copy bandwidth (MB/s) for a transfer of memSize bytes.
// Timing policy is the same as in testDeviceToHostTransfer().
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
    unsigned char *h_data = allocHostBuffer(memSize, memMode, wc, "testHostToDeviceTransfer");

    // Two extra host buffers are allocated and written before the timed copies
    // (named "cache clear" buffers in the original code).
    unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
    unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
    if (h_cacheClear1 == NULL || h_cacheClear2 == NULL)
    {
        fprintf(stderr, "\nNo host memory to run testHostToDeviceTransfer\n");
        exit(EXIT_FAILURE);
    }
    for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++)
    {
        h_cacheClear1[i] = (unsigned char)(i & 0xff);
        h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
    }

    unsigned char *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, memSize));

    float elapsedTime = timeCopies(d_data, h_data, memSize, cudaMemcpyHostToDevice,
                                   memMode == PINNED, memMode != PINNED || bDontUseGPUTiming);

    freeHostBuffer(h_data, memMode);
    free(h_cacheClear1);
    free(h_cacheClear2);
    checkCudaErrors(cudaFree(d_data));
    return bandwidthInMBs(memSize, elapsedTime);
}

// Measures device-to-device copy bandwidth (MB/s) for a transfer of memSize bytes.
// The result is doubled because every copied byte is both read and written on the device.
float testDeviceToDeviceTransfer(unsigned int memSize)
{
    unsigned char *h_data = allocHostBuffer(memSize, PAGEABLE, false, "testDeviceToDeviceTransfer");
    unsigned char *d_idata = NULL;
    unsigned char *d_odata = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
    checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
    checkCudaErrors(cudaMemcpy(d_idata, h_data, memSize, cudaMemcpyHostToDevice));

    float elapsedTime = timeCopies(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice,
                                   false, bDontUseGPUTiming);

    freeHostBuffer(h_data, PAGEABLE);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
    return 2.0f * bandwidthInMBs(memSize, elapsedTime);
}

// Runs one measurement of the requested kind on the current device.
static float measureTransfer(memcpyKind kind, unsigned int memSize, memoryMode memMode, bool wc)
{
    switch (kind)
    {
    case DEVICE_TO_HOST:
        return testDeviceToHostTransfer(memSize, memMode, wc);
    case HOST_TO_DEVICE:
        return testHostToDeviceTransfer(memSize, memMode, wc);
    case DEVICE_TO_DEVICE:
        return testDeviceToDeviceTransfer(memSize);
    }
    return 0.0f;
}

// Allocates the size/bandwidth result arrays (bandwidths zeroed); exits on failure.
static void allocResults(unsigned int count, unsigned int **memSizes, double **bandwidths)
{
    *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
    *bandwidths = (double *)malloc(count * sizeof(double));
    if (*memSizes == NULL || *bandwidths == NULL)
    {
        fprintf(stderr, "\nNo host memory to store the bandwidth results\n");
        exit(EXIT_FAILURE);
    }
    for (unsigned int i = 0; i < count; i++)
    {
        (*memSizes)[i] = 0;
        (*bandwidths)[i] = 0.0;
    }
}

// Measures transfer sizes start, start+increment, ... up to end on every device in
// [startDevice, endDevice]; per-size bandwidths are summed over the devices.
// increment must be non-zero and start must not exceed end.
void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
    unsigned int count = 1 + ((end - start) / increment);
    unsigned int *memSizes = NULL;
    double *bandwidths = NULL;
    allocResults(count, &memSizes, &bandwidths);

    // Test device by device.
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
    {
        checkCudaErrors(cudaSetDevice(currentDevice));
        for (unsigned int i = 0; i < count; i++)
        {
            memSizes[i] = start + i * increment;
            bandwidths[i] += measureTransfer(kind, memSizes[i], memMode, wc);
        }
    }

    if (printmode == CSV)
        printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    else
        printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice));
    free(memSizes);
    free(bandwidths);
    return;
}

// Sweeps transfer sizes from 1 KB up to about 64 MB with a step that grows with
// the size, on every device in [startDevice, endDevice]; bandwidths are summed
// over the devices.
void testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
    unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
        + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
        + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
        + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
        + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
        + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
        + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
    unsigned int *memSizes = NULL;
    double *bandwidths = NULL;
    allocResults(count, &memSizes, &bandwidths);

    // Test device by device.
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
    {
        printf("\n");
        checkCudaErrors(cudaSetDevice(currentDevice));
        unsigned int memSize = 0;
        // The index guard keeps writes inside the result arrays even if the
        // step table and the count formula ever get out of sync.
        for (unsigned int i = 0; memSize <= SHMOO_MEMSIZE_MAX && i < count; i++)
        {
            if (memSize < SHMOO_LIMIT_20KB)
                memSize += SHMOO_INCREMENT_1KB;
            else if (memSize < SHMOO_LIMIT_50KB)
                memSize += SHMOO_INCREMENT_2KB;
            else if (memSize < SHMOO_LIMIT_100KB)
                memSize += SHMOO_INCREMENT_10KB;
            else if (memSize < SHMOO_LIMIT_1MB)
                memSize += SHMOO_INCREMENT_100KB;
            else if (memSize < SHMOO_LIMIT_16MB)
                memSize += SHMOO_INCREMENT_1MB;
            else if (memSize < SHMOO_LIMIT_32MB)
                memSize += SHMOO_INCREMENT_2MB;
            else
                memSize += SHMOO_INCREMENT_4MB;

            memSizes[i] = memSize;
            bandwidths[i] += measureTransfer(kind, memSizes[i], memMode, wc);
            printf(".");
        }
    }

    if (CSV == printmode)
        printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    else
        printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice));
    free(memSizes);
    free(bandwidths);
    return;
}

// Dispatches to the range or shmoo driver; quick mode is a range of one DEFAULT_SIZE transfer.
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
    switch (mode)
    {
    case QUICK_MODE:
        testBandwidthRange(DEFAULT_SIZE, DEFAULT_SIZE, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
        break;
    case RANGE_MODE:
        testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
        break;
    case SHMOO_MODE:
        testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
        break;
    default:
        break;
    }
}

// Parses the command line, validates the selected devices and runs the requested
// measurements. Returns false when an argument is invalid, true otherwise
// (including when only the help text was requested).
bool test(const int argc, const char **argv)
{
    // Help, timer selection and output format.
    if (checkCmdLineFlag(argc, argv, "help"))
    {
        printHelp();
        return true;
    }
    if (checkCmdLineFlag(argc, argv, "cputiming"))
        bDontUseGPUTiming = true;
    printMode printmode = USER_READABLE;
    if (checkCmdLineFlag(argc, argv, "csv"))
        printmode = CSV;

    // Host memory mode; page-locked memory is the default.
    memoryMode memMode = PINNED;
    char *memModeStr = NULL;
    if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr))
    {
        if (strcmp(memModeStr, "pageable") == 0)
            memMode = PAGEABLE;
        else if (strcmp(memModeStr, "pinned") == 0)
            memMode = PINNED;
        else
        {
            printf("\nInvalid memory mode - valid modes are pageable or pinned\n");
            printf("\nSee --help for more information\n");
            return false;
        }
    }

    // Write-combined allocation needs cudaHostAlloc(), i.e. CUDART_VERSION >= 2020.
    bool wc = (CUDART_VERSION >= 2020 && checkCmdLineFlag(argc, argv, "wc")) ? true : false;

    // Device selection.
    int startDevice = 0, endDevice = 0;
    char *device = NULL;
    if (getCmdLineArgumentString(argc, argv, "device", &device))
    {
        int deviceCount;
        cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
        if (error_id != cudaSuccess)
        {
            printf("\ncudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
            exit(EXIT_FAILURE);
        }
        if (deviceCount == 0)
        {
            printf("\nNo devices found\n");
            return false;
        }
        if (strcmp(device, "all") == 0)
        {
            printf("\nCumulative Bandwidth to be computed from all the devices\n");
            startDevice = 0;
            endDevice = deviceCount - 1;
        }
        else
        {
            startDevice = endDevice = atoi(device);
            // Valid device indices are 0 .. deviceCount - 1.
            if (startDevice >= deviceCount || startDevice < 0)
            {
                printf("\nInvalid GPU number %d given hence default gpu 0 will be used\n", startDevice);
                startDevice = endDevice = 0;
            }
        }
    }

    // Check every selected device before running anything.
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
    {
        cudaDeviceProp deviceProp;
        cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
        if (error_id == cudaSuccess)
        {
            printf(" Device %d: %s\n", currentDevice, deviceProp.name);
            if (deviceProp.computeMode == cudaComputeModeProhibited)
            {
                fprintf(stderr, "\nError: device is running in <Compute Mode Prohibited>\n");
                cudaSetDevice(currentDevice);
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            printf("\ncudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
            cudaSetDevice(currentDevice);
            exit(EXIT_FAILURE);
        }
    }

    // Which copy directions to measure; with none selected, measure all three.
    bool htod = checkCmdLineFlag(argc, argv, "htod") ? true : false;
    bool dtoh = checkCmdLineFlag(argc, argv, "dtoh") ? true : false;
    bool dtod = checkCmdLineFlag(argc, argv, "dtod") ? true : false;
    if (!htod && !dtoh && !dtod)
        htod = dtoh = dtod = true;

    // Test mode; quick mode is the default.
    testMode mode = QUICK_MODE;
    char *modeStr = NULL;
    if (getCmdLineArgumentString(argc, argv, "mode", &modeStr))
    {
        if (strcmp(modeStr, "quick") == 0)
            mode = QUICK_MODE;
        else if (strcmp(modeStr, "shmoo") == 0)
            mode = SHMOO_MODE;
        else if (strcmp(modeStr, "range") == 0)
            mode = RANGE_MODE;
        else
        {
            printf("\nInvalid mode - valid modes are quick, range, or shmoo\n");
            return false;
        }
    }

    int startSize, endSize, increment;
    if (mode == RANGE_MODE)
    {
        // Range mode needs explicit bounds. Accept --start/--end as shown in the
        // help text as well as the older --startSize/--endSize spelling.
        const char *startKey = checkCmdLineFlag(argc, argv, "start") ? "start" : "startSize";
        const char *endKey = checkCmdLineFlag(argc, argv, "end") ? "end" : "endSize";

        if (checkCmdLineFlag(argc, argv, startKey))
        {
            if ((startSize = getCmdLineArgumentInt(argc, argv, startKey)) <= 0)
            {
                printf("\nIllegal argument - startSize must be greater than zero\n");
                return false;
            }
        }
        else
        {
            printf("\nMust specify a starting size in range mode\n");
            return false;
        }
        if (checkCmdLineFlag(argc, argv, endKey))
        {
            if ((endSize = getCmdLineArgumentInt(argc, argv, endKey)) <= 0)
            {
                printf("\nIllegal argument - endSize must be greater than zero\n");
                return false;
            }
            if (startSize > endSize)
            {
                printf("\nIllegal argument - startSize is greater than endSize\n");
                return false;
            }
        }
        else
        {
            printf("\nMust specify an endSize size in range mode.\n");
            return false;
        }
        if (checkCmdLineFlag(argc, argv, "increment"))
        {
            if ((increment = getCmdLineArgumentInt(argc, argv, "increment")) <= 0)
            {
                printf("\nIllegal argument - increment must be greater than zero\n");
                return false;
            }
        }
        else
        {
            printf("\nMust specify an increment in user mode\n");
            return false;
        }
    }
    else
        startSize = endSize = increment = DEFAULT_SIZE;

#ifdef DEBUG
    // Debug override: ignore the command line selection made above.
    htod = dtoh = dtod = true;
    mode = SHMOO_MODE;
    wc = true;
#endif

    // Run the tests.
    printf(" %s, %s cudaHostAllocWriteCombined", (mode == QUICK_MODE) ? "QUICK_MODE" : ((mode == RANGE_MODE) ? "RANGE_MODE" : "SHMOO_MODE"), wc ? "enable" : "disable");
    if (htod)
        testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
    if (dtoh)
        testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc);
    if (dtod)
        testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);

    // Select each device that was used once more before returning.
    for (int nDevice = startDevice; nDevice <= endDevice; nDevice++)
        cudaSetDevice(nDevice);
    return true;
}

// Entry point: runs the test, prints PASS/Fail and waits for a key press.
int main(int argc, char **argv)
{
    printf("Start.\n");
    printf("Finish: %s.\n", test(argc, (const char **)argv)? "Result = PASS" : "Result = Fail");
    getchar();
    return 0;
}

▶ 输出结果:QUICK_MODE,是否写入合并都尝试了。

Start.
Device : GeForce GTX
QUICK_MODE, enable cudaHostAllocWriteCombined
Host to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
12014.1 Device to Host Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
12780.3 Device to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
185140.2 Finish: Result = PASS.
Start.
Device : GeForce GTX
QUICK_MODE, disable cudaHostAllocWriteCombined
Host to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
12369.4 Device to Host Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
12741.7 Device to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
185244.1 Finish: Result = PASS.
Start.
Device : GeForce GTX
QUICK_MODE, disable cudaHostAllocWriteCombined // 使用老版本的函数 cudaMallocHost()
Host to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
12205.4 Device to Host Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
12814.8 Device to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
187538.8
Finish: Result = PASS.

▶ 输出结果:SHMOO_MODE(RANGE_MODE 模式需要给 .exe 额外参数,这里没有测试)

Start.
Device : GeForce GTX
SHMOO_MODE, enable cudaHostAllocWriteCombined
.................................................................................
Host to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
728.8
1319.7
1797.7
2441.4
2921.7
3386.9
3776.8
4027.1
4125.7
4672.5
5063.3
5302.6
5568.9
5814.7
6100.3
6056.2
6459.0
6734.9
6510.4
6950.6
7209.5
7585.0
7812.5
7948.8
8048.6
8137.4
8599.8
8616.7
8899.1
9084.3
9152.1
9276.1
9319.9
9356.9
9536.7
10097.5
10357.5
10209.5
10612.0
10923.5
11632.7
12065.2
12157.0
12240.7
12227.5
12314.8
12379.2
12349.4
12126.7
12405.8
12254.6
12200.2
12343.5
12267.2
12405.5
12360.6
12376.7
12435.5
12441.0
12390.6
12354.7
12560.6
12540.4
12414.0
12387.1
12436.0
12455.3
12458.1
12438.6
12594.3
12459.4
12468.8
12464.1
12477.6
12609.3
12509.2
12511.1
12510.9
12544.5
12536.1
12589.3 .................................................................................
Device to Host Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
1190.9
2164.1
2989.5
3792.5
4398.9
4882.8
5340.6
5908.8
6189.5
6341.3
6975.4
7233.8
7381.0
7724.2
7744.1
8132.7
8178.1
8410.6
8433.9
8719.3
9027.0
9155.3
9474.1
9205.7
9390.0
9701.2
9911.4
10044.6
10271.6
10444.5
10456.5
10480.2
10645.0
10698.2
10847.7
11068.5
11336.5
11522.3
11564.6
11765.8
12276.1
12477.4
12548.2
12626.9
12685.4
12680.3
12572.4
12735.9
12746.6
12612.6
12590.1
12686.1
12613.7
12698.9
12744.6
12671.2
12725.5
12656.7
12748.5
12679.3
12633.1
12699.6
12742.6
12743.1
12778.5
12853.2
12720.3
12687.7
12723.4
12727.3
12735.1
12781.9
12728.6
12698.2
12741.9
12740.8
12727.9
12732.4
12732.0
12746.5
12721.2 .................................................................................
Device to Device Bandwidth, Device(s), PINNED Memory Transfers
Transfer Size: Bytes Bandwidth: MB/s
996.5
1606.9
2381.9
3112.5
3875.2
4455.8
5314.0
5986.6
6709.2
7414.0
8016.6
8647.5
9300.6
9943.2
9765.6
13329.8
13697.3
6595.5
16327.5
16054.5
17978.6
19574.6
20897.6
22786.5
24414.1
25933.6
27669.3
29377.2
30542.7
33235.8
35511.4
35319.8
37231.3
39635.4
40521.3
39324.7
69499.6
70531.8
77779.3
85277.7
162211.3
204873.3
258718.8
274195.0
309201.8
261475.0
362004.1
391685.8
362360.9
155871.0
174356.8
179414.9
182220.3
170005.7
179022.1
177974.8
177474.9
182872.5
187523.3
184357.2
187779.5
188050.6
188358.7
188660.6
189137.7
189415.0
182262.0
183092.1
184434.2
183828.0
185503.0
184717.2
186353.3
185746.6
184612.9
185253.0
185704.4
186218.8
190486.9
190930.6
185773.8
Finish: Result = PASS.

▶ 涨姿势

● 申请页锁定(pinned)主机内存时,使用新版本的函数 cudaHostAlloc() 与老版本的函数 cudaMallocHost() 性能没有明显差距。

● 申请页锁定主机内存时是否指明写入合并(cudaHostAllocWriteCombined)标志,性能没有明显差距。

● 对不同大小的内存块进行拷贝,可以发现带宽随单次传输大小的增大而增大,并逐渐趋于稳定。设备内部(Device to Device)的内存拷贝带宽比设备与主机之间的拷贝带宽高一个量级。

● 从主机内存向设备进行内存拷贝的时候使用了占位内存 h_cacheClear1 和 h_cacheClear2(共 32M 大小),这是为了消除主机内存缓存对内存拷贝的加速作用。

1_Utilities__bandwidthTest的更多相关文章

随机推荐

  1. CF1119 Global Round 2

    CF1119A Ilya and a Colorful Walk 这题二分是假的.. \(1,2,1,2,1\) 有间隔为 \(3\) 的,但没有间隔为 \(2\) 的.开始被 \(hack\) 了一 ...

  2. 一个不错的nomad raw_exec && docker 运行例子(集成访问网关)

    github 上有一个关于nomad up && runing 不错的项目,包含了一个tomcat 应用的部署,以及基于容器运行的网关服务 项目参考地址 https://github. ...

  3. 详解SID之终结篇

    今天测试某款监控软件时遇到一个比较棘手的问题,这款软件需要在被监控端安装客户端程序.成功在第一个节点安装好客户端后问题出现了,在其他节点安装时报错无法安装.软件报的错误信息无从下手且系统日志也看不出什 ...

  4. 按的第一个greasemonkey插件:评论时可以粘贴啦~~

    原来的样子:如果按ctrl+V会跳出错误

  5. md5,base64,rsa

    MD5功能:    输入任意长度的信息,经过处理,输出为128位的信息(数字指纹):    不同的输入得到的不同的结果(唯一性):    根据128位的输出结果不可能反推出输入的信息(不可逆): 1. ...

  6. centos7 MFS drbd keepalived

    环境: centos7.3 + moosefs 3.0.97 + drbd84-utils-8.9.8-1 + keepalived-1.2.13-9 工作原理: 架构图: 节点信息: 节点名     ...

  7. 修改Nginx的header伪装服务器

    有时候为了伪装自己的真实服务器环境.不像让对方知道自己的webserver真实环境,就不得不修改我们的webserer软件了!今天看了一下baidu.com的webserver感觉像是nginx修改的 ...

  8. bzoj 4566 [Haoi2016]找相同字符——广义后缀自动机

    题目:https://www.lydsy.com/JudgeOnline/problem.php?id=4566 每个后缀结尾处 ct[ ] = 1 ,按拓扑序 dp 一下就能求出 right 集合的 ...

  9. WIN7\win10下使用批处理配置JAVA环境变量

    我找了很多环境变量批处理的教程,都不太满意,因此综合修改了下,拼凑出了这么一个版本. 下面这个是我主要参考的博客,大部分的代码都是来自这里: http://blog.csdn.net/lpy36543 ...

  10. mySQL 教程 第5章 插入 更新与删除数据

    使用SQL Manager管理工具连接到schoolDB.由于三张表都设置了主键,因此,以下练习中插入的记录,主键不能重. 插入数据 1. 练习:为表的所有字段插入数据 为表中所有字段插入数据,可以不 ...