不合理的代码

 /*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/ #include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20) __global__ void kernel(int *a, int *b, int *c) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
int idx1 = (idx + ) % ;
int idx2 = (idx + ) % ;
float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
c[idx] = (as + bs) / ;
}
} int main(void) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR(cudaGetDevice(&whichDevice));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
if (!prop.deviceOverlap) {
printf("Device will not handle overlaps, so no speed up from streams\n");
return ;
} cudaEvent_t start, stop;
float elapsedTime; cudaStream_t stream0, stream1;
int *host_a, *host_b, *host_c;
int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1; // start the timers
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop)); // initialize the streams
HANDLE_ERROR(cudaStreamCreate(&stream0));
HANDLE_ERROR(cudaStreamCreate(&stream1)); // allocate the memory on the GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
N * sizeof(int))); // allocate host locked memory, used to stream
HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault)); for (int i = ; i<FULL_DATA_SIZE; i++) {
host_a[i] = rand();
host_b[i] = rand();
} HANDLE_ERROR(cudaEventRecord(start, ));
// now loop over full data, in bite-sized chunks
for (int i = ; i<FULL_DATA_SIZE; i += N * ) {
// copy the locked memory to the device, async
HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0)); kernel << <N / , , , stream0 >> >(dev_a0, dev_b0, dev_c0); // copy the data from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream0)); // copy the locked memory to the device, async
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1));
HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1)); kernel << <N / , , , stream1 >> >(dev_a1, dev_b1, dev_c1); // copy the data from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream1));
}
HANDLE_ERROR(cudaStreamSynchronize(stream0));
HANDLE_ERROR(cudaStreamSynchronize(stream1)); HANDLE_ERROR(cudaEventRecord(stop, )); HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
start, stop));
printf("Time taken: %3.1f ms\n", elapsedTime); // cleanup the streams and memory
HANDLE_ERROR(cudaFreeHost(host_a));
HANDLE_ERROR(cudaFreeHost(host_b));
HANDLE_ERROR(cudaFreeHost(host_c));
HANDLE_ERROR(cudaFree(dev_a0));
HANDLE_ERROR(cudaFree(dev_b0));
HANDLE_ERROR(cudaFree(dev_c0));
HANDLE_ERROR(cudaFree(dev_a1));
HANDLE_ERROR(cudaFree(dev_b1));
HANDLE_ERROR(cudaFree(dev_c1));
HANDLE_ERROR(cudaStreamDestroy(stream0));
HANDLE_ERROR(cudaStreamDestroy(stream1)); return ;
}

代码下载

basic_double_stream_incorrect的更多相关文章

随机推荐

  1. django进阶之缓存

    Model 到目前为止,当我们的程序涉及到数据库相关操作时,我们一般都会这么搞: 创建数据库,设计表结构和字段 使用 MySQLdb 来连接数据库,并编写数据访问层代码 业务逻辑层去调用数据访问层执行 ...

  2. Flutter SDK的下载与安装步骤 (mac版)

    本月初(应该是2018年12月4日),Google在其Flutter Live 2018大会上正式发布 Flutter 1.0 版本. 当然我们不会怀疑Google团队的技术实力,但它和React N ...

  3. PHP与thinkphp中var_dump()打印数组显示不全问题

    在我们进行php开发的时候,经常会使用var_dump()函数进行数组的打印,以方便我们程序的调试,而有时候我们在进行多维数组打印的时候会发现多维数组打印不全,有些地方被…代替,这就是我们php配置的 ...

  4. Sublime Text 3 多行游标

    选中要修改的地方ctrl+D ,要跳过不需要修改的选中的就用ctrl+k+d 选中要修改的地方ctrl+D,选中所有要修改的 alt+f3 ctrl+A  ,然后ctrl+shift+L 按住shif ...

  5. List<Object> 使用Linq

    List<Asset> bdList = allAsset.Where(m => m.Owner.Depts == view.DeptName).ToList(); var quer ...

  6. Scala_Load csv data to hive via spark2.1

    code: package com.liupu import org.apache.spark.{ SparkContext, SparkConf } import org.apache.spark. ...

  7. jquery——下载、使用

    jQuery是目前使用最广泛的javascript函数库. 怎样安装? 这是下载地址:https://code.jquery.com/ minified是压缩版的 新建一个网页打开上面这个网址,ctr ...

  8. SGU - 409

    题目链接:https://vjudge.net/contest/239445#problem/H 题目大意:输入n,k,有n*n* n*n的网格,要求每行每列刚好有k个*,每n*n的小方格内也刚好有k ...

  9. asp.net mvc网站的发布与IIS配置

    一.安装IIS (如果服务器上已经安装了就跳过) 控制面板——程序——程序功能——打开或关闭windows功能——勾选Inertnet信息服务下面所有选项——确定 二.获取发布文件(如果已经发不好就跳 ...

  10. sql server sql语句

    添加联合唯一索引 create unique index 索引名 on 表名(列名1,列名2……)