cuda编程并行归约

cuda编程并行归约

AtomicAdd调用出错

在 CUDA 中调用 atomicAdd 函数时，编译器总是提示“未定义标识符”。在网上搜了一下，于是做了如下修改：

右键解决方案属性-》配置属性-》CUDA C/C++-》Device-》Code Generation，加入compute_20,sm_20，并且把下面的“从父级或项目属性默认设置继承”的勾选去掉

gpu cpu下时间计算

// On the CPU: measure processor time around a section of host code.
#include <time.h>

clock_t start,end;

start  = clock();

// cpu codes (the host work being timed goes here)

end = clock();

// clock() counts implementation-defined ticks, so the difference is scaled by
// CLOCKS_PER_SEC and reported in milliseconds (the unit cudaEventElapsedTime uses).
printf("CPU Time: %.5f\n", 1000.0f * (float)(end-start) / CLOCKS_PER_SEC);

// On the GPU: bracket the device work with two CUDA events and read the
// elapsed time between them.
cudaEvent_t evBegin, evEnd;
cudaEventCreate(&evBegin);
cudaEventCreate(&evEnd);

// Record the opening event in stream 0.
cudaEventRecord(evBegin, 0);

//gpu codes

// Record the closing event in stream 0, then wait on the host until it completes.
cudaEventRecord(evEnd, 0);
cudaEventSynchronize(evEnd);

float elapsed;
cudaEventElapsedTime(&elapsed, evBegin, evEnd);
printf("GPU Time: %.5f\n", elapsed);

// Release both events.
cudaEventDestroy(evBegin);
cudaEventDestroy(evEnd);

加法的归约

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <device_launch_parameters.h>

#include <cuda_runtime.h>

#include <book.h>

// Number of input floats; main allocates and fills Size elements.
const int Size = 256;

// Number of thread blocks in the kernel launch; the output buffers hold one
// partial sum per block.
const int block = 8;

// Threads per block in the kernel launch; also the size of the shared buffer
// declared in calc4.
const int thread = 32;

/*
 * Per-block sum reduction with interleaved addressing, done in place on `in`.
 *
 * Each block reduces the blockDim.x consecutive floats starting at
 * in + blockIdx.x * blockDim.x and writes the block total to out[blockIdx.x].
 *
 * in  : device array holding at least gridDim.x * blockDim.x floats; its
 *       contents are overwritten with partial sums.
 * out : device array holding at least gridDim.x floats.
 */
__global__ void calc(float *in, float *out){
	unsigned int tid = threadIdx.x;
	// Slice of the input owned by this block.
	float * target = in + blockIdx.x * blockDim.x;
	// No early return: every thread of the block has to reach each
	// __syncthreads() below, so inactive threads simply skip the addition.
	for(unsigned int stride = 1 ; stride < blockDim.x ; stride *= 2)
	{
		// A thread whose index is a multiple of 2*stride absorbs the element
		// `stride` positions to its right. The second test keeps that read
		// inside this block's slice when blockDim.x is not a power of two.
		if(tid % (stride*2) == 0 && tid + stride < blockDim.x)
		{
			target[tid] += target[tid+stride];
		}
		__syncthreads();
	}
	// Element 0 of the slice now holds the block total.
	if(tid == 0)
	{
		out[blockIdx.x] = target[0];
	}
}

/*
 * Per-block sum reduction, in place on `in`, where thread t handles index
 * 2*stride*t so that the working threads are the lowest-numbered ones.
 *
 * Each block reduces the blockDim.x consecutive floats starting at
 * in + blockIdx.x * blockDim.x and writes the block total to out[blockIdx.x].
 *
 * in  : device array holding at least gridDim.x * blockDim.x floats; its
 *       contents are overwritten with partial sums.
 * out : device array holding at least gridDim.x floats.
 */
__global__ void calc2(float *in, float *out)
{
	unsigned int tid = threadIdx.x;
	// Slice of the input owned by this block.
	float * target = in + blockIdx.x * blockDim.x;
	// No early return: every thread of the block has to reach each
	// __syncthreads() below.
	//stride = 1,2,4,8,...
	for(unsigned int stride = 1 ; stride < blockDim.x ; stride *= 2)
	{
		unsigned int index = 2*stride*tid;
		// index + stride < blockDim.x implies index < blockDim.x and also keeps
		// the right-hand read inside the slice for non-power-of-two block sizes.
		if(index + stride < blockDim.x)
			target[index] += target[index+stride];
		__syncthreads();
	}
	// Element 0 of the slice now holds the block total.
	if(tid == 0)
	{
		out[blockIdx.x] = target[0];
	}
}

/*
 * Strided reduction: per-block sum, in place on `in`, where the stride starts
 * at blockDim.x/2 and is halved each round so thread t adds element t+stride
 * into element t.
 *
 * Each block reduces the floats starting at in + blockIdx.x * blockDim.x and
 * writes the block total to out[blockIdx.x].
 *
 * Precondition: blockDim.x is a power of two. With any other block size the
 * halving stride skips some elements of the slice.
 *
 * in  : device array holding at least gridDim.x * blockDim.x floats; its
 *       contents are overwritten with partial sums.
 * out : device array holding at least gridDim.x floats.
 */
__global__ void calc3(float *in, float *out)
{
	unsigned int tid = threadIdx.x;
	// Slice of the input owned by this block.
	float * target = in + blockIdx.x * blockDim.x;
	// No early return: every thread of the block has to reach each
	// __syncthreads() below; threads with tid >= stride just skip the addition.
	for(unsigned int stride = blockDim.x/2 ; stride > 0 ; stride /=2)
	{
		if(tid < stride)
			target[tid] += target[tid+stride];
		__syncthreads();
	}
	// Element 0 of the slice now holds the block total.
	if(tid == 0)
	{
		out[blockIdx.x] = target[0];
	}
}

/*
 * Per-block sum reduction staged through shared memory.
 *
 * Each block copies its slice of `in` (starting at in + blockIdx.x * blockDim.x)
 * into a shared buffer of `thread` floats, reduces it with a halving stride and
 * writes the block total to out[blockIdx.x]. `in` is not modified.
 *
 * Expected launch: blockDim.x == thread and a power of two. If blockDim.x is
 * larger than `thread`, only the first `thread` elements of each slice are
 * summed, because that is all the shared buffer can hold.
 *
 * in  : device array holding at least gridDim.x * blockDim.x floats.
 * out : device array holding at least gridDim.x floats.
 */
__global__ void calc4(float *in, float *out)
{
	int tid = threadIdx.x;
	int bid = blockIdx.x;
	float * target=in + bid * blockDim.x;
	__shared__ float share_in[thread];
	// The guard used to read `if(tid < thread) return;`, which made every
	// in-range thread exit before doing any work. In-range threads now load
	// their element, and nobody returns early so all threads reach the barriers.
	if(tid < thread)
		share_in[tid] = target[tid];
	__syncthreads();
	for(int stride = blockDim.x/2 ; stride > 0; stride /= 2)
	{
		// The second test keeps both indices inside share_in.
		if(tid < stride && tid + stride < thread)
		{
			share_in[tid] += share_in[tid+stride];
		}
		__syncthreads();
	}
	// Slot 0 now holds the block total.
	if(tid == 0)
	{
		out[blockIdx.x] = share_in[0];
	}
}

/*
 * Host driver for the reduction kernels.
 *
 * Fills an array with 0..Size-1, copies it to the device, runs calc4 with
 * `block` blocks of `thread` threads, copies the per-block partial sums back,
 * adds them on the host and prints the elapsed GPU time and the total.
 * Waits for a key press before returning 0.
 */
int main()
{
	// host buffers
	float * indata;  // Size input values
	float * outdata; // one partial sum per block
	float * ans;     // the final total (a single float)
	// device buffers
	float * dev_indata;  // Size input values
	float * dev_outdata; // one partial sum per block

	// host malloc
	indata = (float*)malloc(sizeof(float)*Size);
	outdata = (float*)malloc(sizeof(float)*block);
	ans = (float*)malloc(sizeof(float));

	// device malloc
	HANDLE_ERROR(cudaMalloc((void**)&dev_indata,sizeof(float)*Size));
	HANDLE_ERROR(cudaMalloc((void**)&dev_outdata,sizeof(float)*block));

	// init & generate data
	for(int i = 0 ; i < Size ; i++)
	{
		indata[i] = i;
	}
	*ans = 0;

	// time start
	cudaEvent_t st,ed;
	cudaEventCreate(&st);
	cudaEventCreate(&ed);
	cudaEventRecord(st,0);

	// memcpy to device
	HANDLE_ERROR(cudaMemcpy(dev_indata,indata,sizeof(float)*Size,cudaMemcpyHostToDevice));

	// kernel launch
	cudaDeviceSynchronize();
	calc4<<<block,thread>>>(dev_indata,dev_outdata);
	// A bad launch configuration is only reported through cudaGetLastError().
	HANDLE_ERROR(cudaGetLastError());
	cudaDeviceSynchronize();

	// memcpy to host
	HANDLE_ERROR(cudaMemcpy(outdata,dev_outdata,sizeof(float)*block,cudaMemcpyDeviceToHost));

	// time end
	cudaEventRecord(ed,0);
	cudaEventSynchronize(ed);
	float gpu_time;
	cudaEventElapsedTime(&gpu_time,st,ed);

	// add up the per-block partial sums on the host
	for(int i = 0 ; i < block ; i++)
	{
		//printf("%.3f\n",outdata[i]);
		*ans += outdata[i];
	}
	printf("GPU Time: %.5f\nAns: %.5f\n",gpu_time,*ans);

	// destroy the timing events
	cudaEventDestroy(st);
	cudaEventDestroy(ed);

	// Release device memory with cudaFree and host memory with free. The
	// previous code passed the host pointers to cudaFree and leaked the
	// device buffers.
	cudaFree(dev_indata);
	cudaFree(dev_outdata);
	free(indata);
	free(outdata);
	free(ans);

	getchar();
	return 0;
}

矩阵乘法

#include <stdlib.h>

#include <cuda_runtime.h>

#include <stdio.h>

#include <cuda.h>

#include <device_launch_parameters.h>

// Matrix dimension: every matrix in this program is N x N, stored as a flat
// array of N*N ints.
const int N = 20;

/*
 * Multiplies two N x N int matrices stored as flat row-major arrays.
 *
 * Thread (threadIdx.x, threadIdx.y) computes the single output element at row
 * threadIdx.x, column threadIdx.y. blockIdx is not used, so one block of at
 * least N x N threads covers the whole result.
 *
 * a, b : device arrays of N*N ints (the two operands).
 * out  : device array of N*N ints receiving a * b.
 */
__global__ void mul(int *a,int* b,int *out)
{
  unsigned int tidx = threadIdx.x;   // output row
  unsigned int tidy = threadIdx.y;   // output column
  // Bound the row and the column separately. The old test `offset > N*N` let
  // offset == N*N through and did not stop a too-large column from wrapping
  // into the next row.
  if(tidx >= N || tidy >= N)return;
  unsigned int offset = tidx*N + tidy;
  int t = 0;
  // Dot product of row tidx of a with column tidy of b.
  for(int i = 0 ; i < N ; i++)
  {
    t += a[tidx*N+i]*b[i*N+tidy];
  }
  out[offset] = t;
}

/*
 * Host driver for the matrix multiplication example.
 *
 * Fills two N x N matrices with 1..N*N, prints the product computed on the CPU,
 * then computes the same product on the GPU with `mul`, prints it, and prints
 * the GPU time (host-to-device copies, kernel and device-to-host copy).
 */
int main()
{
  //host
  int * matrix1;
  int * matrix2;
  int * output;
  //device
  int * dev_matrix1;
  int * dev_matrix2;
  int * dev_output;

  //host malloc
  matrix1 = (int*)malloc(sizeof(int)*N*N);
  matrix2 = (int*)malloc(sizeof(int)*N*N);
  output = (int*)malloc(sizeof(int)*N*N);

  //device malloc
  cudaMalloc((void**)&dev_matrix1,sizeof(int)*N*N);
  cudaMalloc((void**)&dev_matrix2,sizeof(int)*N*N);
  cudaMalloc((void**)&dev_output,sizeof(int)*N*N);

  //init: generate data
  for(int i = 0 ; i < N*N ; i++)
  {
    matrix1[i] = i+1;
    matrix2[i] = i+1;
    output[i] = 0;
  }

  //CPU reference: print each element of matrix1 * matrix2
  for(int i = 0 ; i < N ; i++)
  {
    for(int j = 0 ; j < N ; j++){
      int tp = 0;
      for(int k = 0 ; k < N ; k++)
      {
        tp += matrix1[i*N+k] * matrix2[k*N+j];
      }
      printf("%d ",tp);
    }
  }
  printf("\n----------\n");

  //time start
  cudaEvent_t st,ed;
  cudaEventCreate(&st);
  cudaEventCreate(&ed);
  cudaEventRecord(st,0);

  //memcpy to device
  cudaMemcpy(dev_matrix1,matrix1,sizeof(int)*N*N,cudaMemcpyHostToDevice);
  cudaMemcpy(dev_matrix2,matrix2,sizeof(int)*N*N,cudaMemcpyHostToDevice);

  //kernel launch: mul ignores blockIdx, so a single N x N block computes the
  //whole product; a second block would only redo the same writes.
  mul<<<1,dim3(N,N)>>>(dev_matrix1,dev_matrix2,dev_output);

  //memcpy to host
  cudaMemcpy(output,dev_output,sizeof(int)*N*N,cudaMemcpyDeviceToHost);

  //time end: stop the clock before printing so host printf calls are not
  //counted as GPU time
  cudaEventRecord(ed,0);
  cudaEventSynchronize(ed);
  float gpu_time;
  cudaEventElapsedTime(&gpu_time,st,ed);

  //output
  for(int i = 0 ; i < N*N ; i++)
  {
    printf("%d ",output[i]);
  }
  printf("\n");
  printf("gpu time: %.5f\n",gpu_time);

  //destroy the timing events
  cudaEventDestroy(st);
  cudaEventDestroy(ed);

  //release device and host memory
  cudaFree(dev_matrix1);
  cudaFree(dev_matrix2);
  cudaFree(dev_output);
  free(matrix1);
  free(matrix2);
  free(output);
  return 0;
}

矩阵转置

#include <iostream>

#include <stdlib.h>

#include <stdio.h>

#include "cuda.h"

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

// Matrix dimension: the matrices in this program are N x N, stored as flat
// arrays of N*N ints.
const int N = 5;

/*
 * Prints an N x N int matrix stored as a flat array: elements are separated by
 * tabs, each row ends with a newline, and one extra blank line follows.
 *
 * arr : host array of N*N ints.
 */
void output(int * arr)
{
  for(int row = 0 ; row < N ; ++row)
  {
    const int * line = arr + row * N;
    for(int col = 0 ; col < N ; ++col)
      printf("%d\t",line[col]);
    printf("\n");
  }
  printf("\n");
}

/*
 * Transposes an N x N int matrix using global memory only.
 *
 * Each thread derives an (x, y) position from its block and thread indices and,
 * if that position lies inside the matrix, copies in[x + N*y] to out[y + N*x].
 *
 * in  : device array of N*N ints (source).
 * out : device array of N*N ints (destination).
 */
__global__ void trans(int * in, int * out)
{
  const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
  // Positions outside the matrix have nothing to copy.
  if(col >= N || row >= N)
    return;
  out[row + N * col] = in[col + N * row];
}

/*
 * Transposes an N x N int matrix through an N x N shared-memory tile.
 *
 * Each thread loads one element into the tile, the block synchronises, and each
 * thread then writes out the tile element at the swapped position. Block
 * offsets are multiples of N and both steps are guarded by `< N`, so only
 * block (0,0) moves data for an N x N matrix.
 *
 * in  : device array of N*N ints (source).
 * out : device array of N*N ints (destination).
 */
__global__ void trans2(int * in , int * out)
{
  // The tile has the same element type as the data. It used to be float,
  // which cannot represent every int value exactly.
  __shared__ int block[N][N];
  unsigned int xIndex = blockIdx.x * N + threadIdx.x;
  unsigned int yIndex = blockIdx.y * N + threadIdx.y;
  if((xIndex < N) && (yIndex < N))
  {
    unsigned int index_in = yIndex * N +xIndex;
    // Store as [y][x]; the read below uses [x][y]. Before this change both
    // sides used [x][y], so the kernel copied the matrix without transposing.
    block[threadIdx.y][threadIdx.x] = in[index_in];
  }
  // All loads must be in the tile before any thread reads another thread's element.
  __syncthreads();
  xIndex = blockIdx.y * N + threadIdx.x;
  yIndex = blockIdx.x * N + threadIdx.y;
  if((xIndex < N) && (yIndex < N))
  {
    unsigned int index_out = yIndex * N + xIndex;
    out[index_out] = block[threadIdx.x][threadIdx.y];
  }
}

/*
 * Host driver for the transpose example: builds an N x N matrix holding
 * 1..N*N, transposes it on the GPU with `trans`, and prints the source matrix
 * followed by the transposed one.
 */
int main()
{
  const size_t bytes = sizeof(int)*N*N;

  // host buffers
  int * hostIn = (int*)malloc(bytes);
  int * hostOut = (int*)malloc(bytes);

  // device buffers
  int * devIn;
  int * devOut;
  cudaMalloc((void**)&devIn,bytes);
  cudaMalloc((void**)&devOut,bytes);

  // fill the source matrix with 1, 2, ..., N*N
  for(int k = 0 ; k < N*N ; ++k)
    hostIn[k] = k+1;

  // upload, run one block of N x N threads, download
  cudaMemcpy(devIn,hostIn,bytes,cudaMemcpyHostToDevice);
  const dim3 threadsPerBlock(N,N);
  trans<<<1,threadsPerBlock>>>(devIn,devOut);
  cudaMemcpy(hostOut,devOut,bytes,cudaMemcpyDeviceToHost);

  // show the source and the result
  output(hostIn);
  printf("\n--------\n");
  output(hostOut);

  // cleanup
  cudaFree(devIn);
  cudaFree(devOut);
  free(hostIn);
  free(hostOut);
  return 0;
}

统计数目

#include <iostream>

#include <stdlib.h>

#include <stdio.h>

#include "cuda.h"

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

// Number of counters: the count arrays on host and device have N entries.
const int N = 26;

// Only used below to derive S.
const int L = 128;

// Number of chars in the input buffer ("book").
const int S = L*4;

// Number of thread blocks in the kernel launch.
const int block = 4;

// Threads per block in the kernel launch.
const int thread = 32;

/*
 * Counts how often each value occurs in `book`.
 *
 * Every block accumulates into a private shared array of N counters with
 * atomicAdd, then adds its counters into the global `record` array.
 *
 * book   : device array of S chars. Each value is used directly as a counter
 *          index, so it must lie in [0, N).
 * record : device array of N ints. Counts are added to whatever it already
 *          holds, so the caller must zero it before the launch.
 */
__global__ void rec(char* book, int * record)
{
  unsigned int tid = threadIdx.x;
  __shared__ int temp[N];
  // Clear the block's counters. Looping in steps of blockDim.x covers all N
  // slots for any block size. The old `temp[tid] = 0` wrote past the array
  // whenever blockDim.x > N, which is the case for the 32-thread launch in main.
  for(unsigned int bin = tid ; bin < N ; bin += blockDim.x)
  {
    temp[bin] = 0;
  }
  __syncthreads();
  int index = tid + blockIdx.x * blockDim.x;
  int offset = blockDim.x * gridDim.x;
  //printf("%d-%d\n",index,offset);
  // Grid-stride walk over the input.
  while(index < S)
  {
    atomicAdd(&(temp[book[index]]),1);
    index += offset;
  }
  // Every thread's increments must land before the counters are read.
  __syncthreads();
  // Fold this block's counters into the global result, staying inside both arrays.
  for(unsigned int bin = tid ; bin < N ; bin += blockDim.x)
  {
    atomicAdd(&(record[bin]),temp[bin]);
  }
}

/*
 * Host driver for the counting example.
 *
 * Fills a buffer of S chars with pseudo-random values in [0, 26), prints the
 * per-value counts computed on the CPU, then computes the same counts on the
 * GPU with `rec` and prints them on the next line for comparison.
 */
int main()
{
   //host
   char * book;
   int * record;
   //device
   char * dev_book;
   int * dev_record;

   //host malloc
   book = (char*)malloc(sizeof(char)*S);
   record = (int*)malloc(sizeof(int)*N);

   //device malloc
   cudaMalloc((void**)&dev_book,sizeof(char)*S);
   cudaMalloc((void**)&dev_record,sizeof(int)*N);
   // rec adds into dev_record, and cudaMalloc does not clear the memory it
   // returns, so the counters have to be zeroed before the launch.
   cudaMemset(dev_record,0,sizeof(int)*N);

   //init: pseudo-random values in [0, 26)
   for(int i = 0 ; i < S ; i++)
   {
      srand(i+rand());
      book[i] = (i+i*i+rand())%26;
   }

   //cpu reference counts
   int tp[N]={0};
   for(int i = 0 ; i < S ; i++)
   {
     tp[book[i]]++;
   }
   for(int i = 0 ; i < N ; i++)
      printf("%d ",tp[i]);
   printf("\n");

   //memcpy to device
   cudaMemcpy(dev_book,book,sizeof(char)*S,cudaMemcpyHostToDevice);

   //kernel launch
   rec<<<block,thread>>>(dev_book,dev_record);

   //memcpy to host
   cudaMemcpy(record,dev_record,sizeof(int)*N,cudaMemcpyDeviceToHost);

   //output
   for(int i = 0 ; i < N ; i++)
   {
     printf("%d ",record[i]);
   }
   printf("\n");

   //release device and host memory
   cudaFree(dev_book);
   cudaFree(dev_record);
   free(book);
   free(record);
  return 0;
}

平方和求和

分块处理

#include <iostream>

#include <stdlib.h>

#include <stdio.h>

#include "cuda.h"

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <time.h>

/*

*  author : pprp

*  theme : 平方和

*/

// Number of input ints.
const int N = 128;

// Number of chunks the input is split into; one partial sum is produced per
// chunk (the result buffers hold `block` ints).
const int block = 4;

// NOTE(review): not referenced by the code in this listing.
const int thread = 32;

/*
 * Sum of squares, chunked: thread t squares and adds one contiguous chunk of
 * `arr` and stores the partial sum in result[t].
 *
 * The input is split into `block` chunks of N / block elements. When N is not
 * a multiple of `block`, the last chunk also takes the leftover elements.
 * Only threadIdx.x is used, so the kernel is meant for a single-block launch
 * with at least `block` threads.
 *
 * arr    : device array of N ints.
 * result : device array of `block` ints receiving the partial sums.
 */
__global__ void calc0(int * arr, int * result)
{
  int tid = threadIdx.x;
  // Threads beyond the number of chunks have no chunk to read and no slot in
  // result to write.
  if(tid >= block)return;
  int Size = N / block;
  int first = tid * Size;
  // The last chunk runs to the end of the array so no element is dropped.
  int last = (tid == block - 1) ? N : first + Size;
  int sum = 0;
  for(int i = first ; i < last ; i++)
  {
    sum += arr[i]*arr[i];
  }
  result[tid] = sum;
  //printf("sum: %d\n",sum);
}

/*
 * Host driver for the chunked sum-of-squares example.
 *
 * Fills an array with 1..N, computes the sum of squares on the CPU, then
 * computes per-chunk partial sums on the GPU with calc0, adds them on the host
 * and prints both answers with their timings in milliseconds.
 */
int main()
{
  //host
  int * arr;
  int * result;
  //device
  int * dev_arr;
  int * dev_result;

  //host malloc
  arr = (int*)malloc(sizeof(int)*N);
  result = (int*)malloc(sizeof(int)*block);

  //device malloc
  cudaMalloc((void**)&dev_arr,sizeof(int)*N);
  cudaMalloc((void**)&dev_result,sizeof(int)*block);

  //init: arr = 1..N, partial sums cleared
  for(int i = 0 ; i < N ; i++)
  {
    arr[i] = i+1;
    if(i < block)
    {
      result[i] = 0;
    }
  }

  //cpu reference
  clock_t start,end;
  start = clock();
  unsigned int res = 0;
  for(int i = 0 ; i < N ; i++)
  {
    res += arr[i]*arr[i];
  }
  end = clock();
  // clock() ticks are converted to milliseconds so the figure is comparable
  // with the GPU time; %u matches the unsigned accumulator.
  printf("cpu ans : %u\ncpu time: %.5f\n",res,1000.0f*float(end-start)/CLOCKS_PER_SEC);

  //time start
  cudaEvent_t st,ed;
  cudaEventCreate(&st);
  cudaEventCreate(&ed);
  cudaEventRecord(st,0);

  //memcpy host -> device
  cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);

  //kernel launch: one thread per chunk, taken from `block` instead of a
  //hard-coded 4 so the launch follows the constant
  calc0<<<1,block>>>(dev_arr,dev_result);

  //memcpy device -> host
  cudaMemcpy(result,dev_result,sizeof(int)*block,cudaMemcpyDeviceToHost);

  //add up the partial sums
  int res2=0;
  for(int i = 0 ; i < block ; i++)
  {
    res2 += result[i];
    //printf("test: %d\n",result[i]);
  }

  //time end
  cudaEventRecord(ed,0);
  cudaEventSynchronize(ed);
  float gpu_time;
  cudaEventElapsedTime(&gpu_time,st,ed);
  printf("gpu ans :%d\ngpu time: %.5f\n",res2,gpu_time);

  //destroy the timing events
  cudaEventDestroy(st);
  cudaEventDestroy(ed);

  //release device and host memory
  cudaFree(dev_arr);
  cudaFree(dev_result);
  free(arr);
  free(result);
  return 0;
}

线程相邻

#include <iostream>

#include <stdlib.h>

#include <stdio.h>

#include "cuda.h"

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <time.h>

/*

*  author : pprp

*  theme : 平方和

*/

// Number of input ints.
const int N = 128;

// Number of threads in the single-block launch and number of partial sums;
// the kernel also uses it as the step between the elements one thread visits.
const int block = 4;

// NOTE(review): not referenced by the code in this listing.
const int thread = 32;

/*
 * Sum of squares, interleaved: thread t squares and adds elements
 * t, t+block, t+2*block, ... of `arr` and stores the partial sum in result[t],
 * so neighbouring threads read neighbouring elements.
 *
 * Only threadIdx.x is used, so the kernel is meant for a single-block launch.
 *
 * arr    : device array of N ints.
 * result : device array of `block` ints receiving the partial sums.
 */
__global__ void calc0(int * arr, int * result)
{
  int tid = threadIdx.x;
  // result has exactly `block` slots. The old test `tid > block` let
  // tid == block through, which writes one element past the end.
  if(tid >= block)return;
  int sum = 0;
  for(int i = tid; i < N ; i+=block)
  {
    sum += arr[i]*arr[i];
  }
  result[tid] = sum;
}

/*
 * Host driver for the interleaved sum-of-squares example.
 *
 * Fills an array with 1..N, computes the sum of squares on the CPU, then
 * computes `block` partial sums on the GPU with calc0 (one block of `block`
 * threads), adds them on the host and prints both answers with their timings
 * in milliseconds.
 */
int main()
{
  //host
  int * arr;
  int * result;
  //device
  int * dev_arr;
  int * dev_result;

  //host malloc
  arr = (int*)malloc(sizeof(int)*N);
  result = (int*)malloc(sizeof(int)*block);

  //device malloc
  cudaMalloc((void**)&dev_arr,sizeof(int)*N);
  cudaMalloc((void**)&dev_result,sizeof(int)*block);

  //init: arr = 1..N, partial sums cleared
  for(int i = 0 ; i < N ; i++)
  {
    arr[i] = i+1;
    if(i < block)
    {
      result[i] = 0;
    }
  }

  //cpu reference
  clock_t start,end;
  start = clock();
  unsigned int res = 0;
  for(int i = 0 ; i < N ; i++)
  {
    res += arr[i]*arr[i];
  }
  end = clock();
  // clock() ticks are converted to milliseconds so the figure is comparable
  // with the GPU time; %u matches the unsigned accumulator.
  printf("cpu ans : %u\ncpu time: %.5f\n",res,1000.0f*float(end-start)/CLOCKS_PER_SEC);

  //time start
  cudaEvent_t st,ed;
  cudaEventCreate(&st);
  cudaEventCreate(&ed);
  cudaEventRecord(st,0);

  //memcpy host -> device
  cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);

  //kernel launch: a single block with one thread per partial sum
  calc0<<<1,block>>>(dev_arr,dev_result);

  //memcpy device -> host
  cudaMemcpy(result,dev_result,sizeof(int)*block,cudaMemcpyDeviceToHost);

  //add up the partial sums
  int res2=0;
  for(int i = 0 ; i < block ; i++)
  {
    res2 += result[i];
    //printf("test: %d\n",result[i]);
  }

  //time end
  cudaEventRecord(ed,0);
  cudaEventSynchronize(ed);
  float gpu_time;
  cudaEventElapsedTime(&gpu_time,st,ed);
  printf("gpu ans :%d\ngpu time: %.5f\n",res2,gpu_time);

  //destroy the timing events
  cudaEventDestroy(st);
  cudaEventDestroy(ed);

  //release device and host memory
  cudaFree(dev_arr);
  cudaFree(dev_result);
  free(arr);
  free(result);
  return 0;
}

多block计算

#include <iostream>

#include <stdlib.h>

#include <stdio.h>

#include "cuda.h"

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <time.h>

/*

*  author : pprp

*  theme : 平方和

*/

// Number of input ints; the result buffers also hold N ints.
const int N = 32;

// Number of thread blocks in the kernel launch.
const int block = 4;

// Threads per block in the kernel launch.
const int thread = 8;

/*
 * Sum of squares across several blocks.
 *
 * Each thread starts at its global index (blockIdx.x * blockDim.x + threadIdx.x),
 * advances by the total number of threads in the grid, accumulates the squares
 * of the elements it visits, stores the partial sum at result[global index]
 * and prints it.
 *
 * arr    : device array of N ints.
 * result : device array with one int per launched thread.
 */
__global__ void calc0(int * arr, int * result)
{
  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  const int step = blockDim.x*gridDim.x;
  int acc = 0;
  int pos = gid;
  while(pos < N)
  {
    const int v = arr[pos];
    acc += v*v;
    pos += step;
  }
  __syncthreads();
  result[gid] = acc;
  // debug trace of this thread's partial sum
  printf("++%d \n",acc);
}

/*
 * Host driver for the multi-block sum-of-squares example.
 *
 * Fills an array with 1..N, computes the sum of squares on the CPU, then
 * launches calc0 with `block` blocks of `thread` threads, copies the N
 * per-thread partial sums back, adds them on the host and prints both answers
 * with their timings in milliseconds.
 */
int main()
{
  //host
  int * arr;
  int * result;
  //device
  int * dev_arr;
  int * dev_result;

  //host malloc
  arr = (int*)malloc(sizeof(int)*N);
  result = (int*)malloc(sizeof(int)*N);

  //device malloc
  cudaMalloc((void**)&dev_arr,sizeof(int)*N);
  cudaMalloc((void**)&dev_result,sizeof(int)*N);

  //init: arr = 1..N. The whole N-entry result buffer is cleared; previously
  //only the first `thread` entries were.
  for(int i = 0 ; i < N ; i++)
  {
    arr[i] = i+1;
    result[i] = 0;
  }

  //cpu reference
  clock_t start,end;
  start = clock();
  unsigned int res = 0;
  for(int i = 0 ; i < N ; i++)
  {
    res += arr[i]*arr[i];
  }
  end = clock();
  // clock() ticks are converted to milliseconds so the figure is comparable
  // with the GPU time; %u matches the unsigned accumulator.
  printf("cpu ans : %u\ncpu time: %.5f\n",res,1000.0f*float(end-start)/CLOCKS_PER_SEC);

  //time start
  cudaEvent_t st,ed;
  cudaEventCreate(&st);
  cudaEventCreate(&ed);
  cudaEventRecord(st,0);

  //memcpy host -> device
  cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);

  //kernel launch
  calc0<<<block,thread>>>(dev_arr,dev_result);

  //memcpy device -> host
  cudaMemcpy(result,dev_result,sizeof(int)*N,cudaMemcpyDeviceToHost);

  //add up the per-thread partial sums
  int res2=0;
  for(int i = 0 ; i < N ; i++)
  {
    res2 += result[i];
    //printf("test: %d\n",result[i]);
  }

  //time end
  cudaEventRecord(ed,0);
  cudaEventSynchronize(ed);
  float gpu_time;
  cudaEventElapsedTime(&gpu_time,st,ed);
  printf("gpu ans :%d\ngpu time: %.5f\n",res2,gpu_time);

  //destroy the timing events
  cudaEventDestroy(st);
  cudaEventDestroy(ed);

  //release device and host memory
  cudaFree(dev_arr);
  cudaFree(dev_result);
  free(arr);
  free(result);
  return 0;
}

【Cuda编程】加法归约的更多相关文章

CUDA中的归约
CUDA编程实战书中的乘方和解决办法: 对一个数组执行某种计算,然后产生一个更小的结果数组. 由一个线程在共享内存上进行迭代并计算出总和值.而如果用并行,所花时间就与数组长度的对数成正比. 代码的思想 ...
CUDA编程（六）进一步并行
CUDA编程(六) 进一步并行在之前我们使用Thread完毕了简单的并行加速,尽管我们的程序运行速度有了50甚至上百倍的提升,可是依据内存带宽来评估的话我们的程序还远远不够.在上一篇博客中给大家介绍 ...
CUDA编程之快速入门
CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...
CUDA编程（十）使用Kahan's Summation Formula提高精度
CUDA编程(十) 使用Kahan's Summation Formula提高精度上一次我们准备去并行一个矩阵乘法.然后我们在GPU上完毕了这个程序,当然是非常单纯的把任务分配给各个线程.也没有经过 ...
CUDA编程学习笔记1
CUDA编程模型是一个异构模型,需要CPU和GPU协同工作. host和device host和device是两个重要的概念 host指代CPU及其内存 device指代GPU及其内存 __globa ...
CUDA编程之快速入门【转】
https://www.cnblogs.com/skyfsm/p/9673960.html CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架 ...
不同版本CUDA编程的问题
1 无法装上CUDA的toolkit 卸载所有的NVIDIA相关的app,包括NVIDIA的显卡驱动,然后重装. 2之前的文件打不开,one or more projects in the solut ...
cuda编程基础
转自: http://blog.csdn.net/augusdi/article/details/12529247 CUDA编程模型 CUDA编程模型将CPU作为主机,GPU作为协处理器(co-pro ...
CUDA学习笔记（一）——CUDA编程模型
转自:http://blog.sina.com.cn/s/blog_48b9e1f90100fm56.html CUDA的代码分成两部分,一部分在host(CPU)上运行,是普通的C代码:另一部分在d ...

随机推荐

看懂Oracle执行计划、表连接方式
看懂Oracle执行计划原文:https://www.cnblogs.com/Dreamer-1/p/6076440.html 最近一直在跟Oracle打交道,从最初的一脸懵逼到现在的略有所知,也 ...
GOLANG错误处理最佳方案errors wrap, Defer, Panic, and Recover
Simple error handling primitives: https://github.com/pkg/errors Defer, Panic, and Recover: ...
ssm后台开发及发布
本文详细讲解一下后台的创建及发布过程,包括踩过的坑 1:首先创建war包形式的maven工程 File>new>Maven project>Create a simple proje ...
（转）跨域的另一种解决方案——CORS（Cross-Origin Resource Sharing）跨域资源共享
在我们日常的项目开发时使用AJAX,传统的Ajax请求只能获取在同一个域名下面的资源,但是HTML5打破了这个限制,允许Ajax发起跨域的请求.浏览器是可以发起跨域请求的,比如你可以外链一个外域的图片 ...
[LeetCode] 72. Edit Distance_hard tag: Dynamic Programming
Given two words word1 and word2, find the minimum number of operations required to convert word1to w ...
Ajax学习整理笔记
AJAX技术的出现使得javascript技术大火.不懂AJAX的同学百度一下,了解AJAX能做什么就可以了. 代码: <!DOCTYPE html> <html> <h ...
linux中vim的常用方法
i 当前光标位置插入 a 当前光标后插入 0 另起一行插入 A 在光标所在行尾插入 I 在光标所在行首插入 :set nu设置行号 :set nunu 取消行号 gg 到第一行 G 到最后一行 $ ...
VS2010/MFC编程入门之四十七（字体和文本输出：CFont字体类）
上一节中鸡啄米讲了MFC异常处理,本节的主要内容是字体CFont类. 字体简介 GDI(Graphics Device Interface),图形设备接口,是Windows提供的一些函数和结构,用于在 ...
[WPF]WPF开发方法论
纵观Windows GUI应用程序开发方法,从Windows API.MFC到Visual Basic再到.NET Framework,WPF的开发方法论是在.NET Framework方法论的基础上 ...
Linux命令: 在线使用linux命令环境
https://www.tutorialspoint.com/unix_terminal_online.php

【Cuda编程】加法归约

cuda编程并行归约

AtomicAdd调用出错

gpu cpu下时间计算

加法的归约

矩阵乘法

矩阵转置

统计数目

平方和求和

分块处理

线程相邻

多block计算

【Cuda编程】加法归约的更多相关文章

随机推荐

热门专题