basic_double_stream_incorrect
不合理的代码
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/ #include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

/*
 * kernel: for every element index idx < N, averages three entries of a and
 * three entries of b, then writes the mean of those two averages to c[idx].
 *
 * The two extra indices are (idx + 1) and (idx + 2) wrapped modulo 256, so
 * they always address one of the first 256 elements of each input buffer.
 * Each triple is summed in int arithmetic before the float division, and the
 * float result is converted back to int when stored into c.
 *
 * a, b : device input buffers holding at least N ints
 * c    : device output buffer holding at least N ints
 *
 * Launch layout: 1-D grid of 1-D blocks; threads whose idx is >= N do nothing.
 */
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

/*
 * main: processes FULL_DATA_SIZE ints in chunks of N using two CUDA streams
 * and prints the elapsed GPU time measured with events.
 *
 * Each loop iteration handles two consecutive chunks: chunk i goes through
 * stream0 and chunk i + N through stream1. For each stream the sequence is
 * copy a -> copy b -> kernel -> copy c back, all enqueued asynchronously
 * from page-locked host memory.
 *
 * NOTE(review): all of stream0's work is enqueued before any of stream1's
 * work. The page title marks this file as the "incorrect" variant, so this
 * ordering is kept as is - presumably for comparison with an interleaved
 * version.
 *
 * Returns 0, both on success and when the device reports no overlap support.
 * CUDA API failures are reported through HANDLE_ERROR.
 */
int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaStream_t stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // create the timing events
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // initialize the streams
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    // allocate the memory on the GPU: one N-int chunk per buffer per stream
    HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N * sizeof(int)));

    // allocate host locked memory, used to stream
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    // fill both inputs with pseudo-random values
    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));

    // now loop over full data, in bite-sized chunks (two chunks per pass)
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // stream0: copy the locked memory to the device, async
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream0));

        kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        // a launch returns no status; pick up configuration errors here
        HANDLE_ERROR(cudaGetLastError());

        // stream0: copy the data from device to locked memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream0));

        // stream1: copy the locked memory to the device, async
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream1));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream1));

        kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        HANDLE_ERROR(cudaGetLastError());

        // stream1: copy the data from device to locked memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream1));
    }

    // wait for both streams to drain before stopping the timer
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
                                      start, stop));
    printf("Time taken: %3.1f ms\n", elapsedTime);

    // cleanup the streams and memory
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));

    // release the timing events as well
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    return 0;
}
代码下载
basic_double_stream_incorrect的更多相关文章
随机推荐
- IOS UIWebView与js的简单交互swift3版
在开发过程中,我们可能遇到ios代码与js交互的情况,本人第一次使用遇到了很多坑,这里纪录一下,方便自己,也方便需要的人. 1.第一步先建一个接口(协议)并继承JSExport 这里实现两个方法提供给 ...
- iOS sqlite
iOS sqlite数据库操作.步骤是: 先加入sqlite开发库libsqlite3.dylib, 新建或打开数据库, 创建数据表, 插入数据, 查询数据并打印 1.新建项目sqliteDemo,添 ...
- thinkphp5引入百度编辑器
在ThinkPHP的模板(html文件)中引入Ueditor 下载ueditor解压至public/static目录 在需要的页面引入js文件 <script type="text/ ...
- Flask&&人工智能AI --2
参考博客: https://www.cnblogs.com/xiao987334176/p/9598606.html 昨日作业讲解 昨天的作业就是,有3个视图函数,分别是/login,/student ...
- 简述raid0,raid1,raid5,raid10 的工作原理及特点
RAID 0 支持1块盘到多块盘,容量是所有盘之和 RAID1 只支持2块盘,容量损失一块盘 RAID 5最少三块盘,不管硬盘数量多少,只损失一块容量 RAID 10最少4块盘,必须偶数硬盘,不管硬盘 ...
- 判断文件是否存在 local/hdfs
在Linux文件系统中,我们可以使用下面的Shell脚本判断某个文件是否存在: # 这里的-f参数判断$file是否存在 if [ ! -f "$file" ]; then ech ...
- Jmeter4.0----正则表达式提取器(12)
1.说明 有时候需要将前一个请求返回的数据作为下一个请求的参数时,我们就需要正则表达式提取器.使用正则表达式提取器去提取我们想要的部分再传入后面的请求中. (之前看了一篇不错的博客,可以参考学习 ht ...
- Pandas处理数据常用方法
# -*- coding: utf-8 -*-import pandas as pd"""(1)利用pandas读取csv文件"""def ...
- linux下.exe文件的安装与使用
1安装wine 2 安装exe软件:下载应用软件.exe,然后点击右键用wine打开/或者在终端中wine 应用软件.exe即能安装. 卸载exe可以右键点击安装软件中uninstall.exe-用w ...
- (转)Mysql数据库之Binlog日志使用总结Linux下用户组、文件权限详解
Linux下用户组.文件权限详解 原文:http://blog.csdn.net/sdulibh/article/details/51566772 用户组 在linux中的每个用户必须属于一个组,不能 ...