1.新建sln（cuda）

win10 cuda_小白之旅（4）：编写第一个cuda kernel

创建成功后，会自动产生一个.cu的样本程序，我们以它为模板进行改写~如果你不确定和我是一样的，我把代码贴在下面：


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise integer addition: each thread writes c[k] = a[k] + b[k], where
// k is the thread's x-index inside its block. Only threadIdx.x is used for
// indexing, so the block index does not contribute to the element offset.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int k = threadIdx.x;
    const int lhs = a[k];
    const int rhs = b[k];
    c[k] = lhs + rhs;
}

// Entry point of the template program: adds two fixed 5-element vectors via
// addWithCuda and prints the result. Returns 0 on success and 1 when either
// the addition or the final device reset reports an error.
int main()
{
    const int kCount = 5;
    const int lhs[kCount] = { 1, 2, 3, 4, 5 };
    const int rhs[kCount] = { 10, 20, 30, 40, 50 };
    int sum[kCount] = { 0 };

    // Run the vector addition through the helper.
    if (addWithCuda(sum, lhs, rhs, kCount) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        sum[0], sum[1], sum[2], sum[3], sum[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Adds the host arrays `a` and `b` (each `size` ints) into host array `c` by
// running addKernel in a single block of `size` threads.
//
// Returns the status of the first CUDA call that failed, or the status of the
// final device-to-host copy when every step succeeded. The three device
// buffers are released on every path before returning.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    const size_t byteCount = size * sizeof(int);

    // Single-pass block: `break` jumps straight to the cleanup code below.
    do {
        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
            break;
        }

        // Device buffers: one output (dev_c) followed by the two inputs.
        cudaStatus = cudaMalloc((void**)&dev_c, byteCount);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        cudaStatus = cudaMalloc((void**)&dev_a, byteCount);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        cudaStatus = cudaMalloc((void**)&dev_b, byteCount);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        // Upload both input vectors.
        cudaStatus = cudaMemcpy(dev_a, a, byteCount, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        cudaStatus = cudaMemcpy(dev_b, b, byteCount, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        // One block, one thread per element.
        addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

        // A kernel launch returns nothing itself; launch errors are read back here.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        // Wait for the kernel to finish; errors raised while it ran are reported here.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            break;
        }

        // Download the result vector.
        cudaStatus = cudaMemcpy(c, dev_c, byteCount, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }
    } while (false);

    // Shared cleanup for the success path and every failure path.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

你不用全部看懂：这个模板里真正做计算的只有 addKernel 中的那一行加法，其余大部分代码都是在检查每一次 cuda 调用的返回值，一旦出错就打印提示并释放显存。你可以认为这是 nvidia 给予我们的一个带完整错误检查的示例程序。

回想一下原来的三个程序：win10 cuda_小白之旅（2）地址：https://blog.csdn.net/qq_39575835/article/details/82970615

这里引入了新的东西：我先介绍一下~

P.S.记录下：个人觉得官网说的主机就是cpu，设备就是gpu。

stdio是标准io函数，比如printf和scanf函数，这个我们很熟悉，今天要说的是stdlib，stdlib里的是常用系统函数，跟系统调用相关的，比如内存申请malloc和释放free,我们在gpu上操作，自然是要跟内存打交道了。
__host__ 一般缺省，主机调用，主机执行。（说白了：主机与设备交互数据，调用device，global等等）
__device__ 设备端调用，设备端执行。
__global__主机调用，设备执行。
需要注意的是，kernel 在 gpu 上执行，所以它不能直接访问主机内存，只能访问设备（gpu）内存，所以我们要学习 cudaMalloc() 函数：它的作用类似于 malloc()，用来为设备数组分配内存；不同之处在于，设备指针是通过第一个参数（指针的地址）带回来的，函数的返回值是错误码，例如 cudaMalloc(&d_in, len * sizeof(float))。
注意，由于是并行的，输出结果很可能无序。

接下来贴代码，请仔细品味与之前三个源程序不同。

kernel.h（原文此处写作 aux_functions.h；但下面的 kernel.cu 和 main.cpp 包含的都是 kernel.h，头文件的文件名必须与 #include 保持一致才能编译通过）

#pragma once

void distanceArray(float *out, float *in, float ref, int len);

kernel.cu


#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include"stdio.h" //使程序可以在后台输出
#include "kernel.h"
#define TPB 32 //一个线程块包含TPB个线程

//cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
//
//__global__ void addKernel(int *c, const int *a, const int *b)
//{
//	int i = threadIdx.x;
//	c[i] = a[i] + b[i];
//}


// 注释了 放在main里了

//float scale(int i, int n)
//{
//	// 强制类型转换，使结果变为float
//	return ((float)i / n - 1);
//}

// Absolute difference |x1 - x2| of two floats. Declared __device__, so it is
// callable from device code only.
__device__
float distance(float x1, float x2)
{
	const float delta = x1 - x2;
	return fabsf(delta);
}

// Kernel: each thread computes the distance between one input element and
// `ref`, stores it in d_out, and prints it from the device.
//
//   d_out  device buffer receiving the distances
//   d_in   device buffer holding the input values
//   ref    reference value the distances are measured from
//   len    number of valid elements; threads whose global index is >= len do
//          nothing. A negative value (the default) disables the guard, which
//          matches the previous behaviour and is only safe when the grid
//          covers exactly the buffer length.
//
// Expects a 1D grid of 1D blocks; the element index is derived from
// blockIdx.x, blockDim.x and threadIdx.x.
__global__
void distanceKernel(float *d_out, float *d_in, float ref, int len = -1)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x; // global element index

	// The grid is rounded up to whole blocks, so the last block may contain
	// threads that fall past the end of the arrays.
	if (len >= 0 && i >= len)
	{
		return;
	}

	const float x = d_in[i];
	d_out[i] = distance(x, ref);
	printf("i=%2d ：dist from %f to %f is %f.\n", i, ref,x,  d_out[i]); // debug output
}

// Host wrapper (called and executed on the host): computes
// out[i] = |in[i] - ref| for i in [0, len) on the GPU.
//
//   out  host buffer with room for `len` floats (written)
//   in   host buffer with `len` floats (read)
//   ref  reference value
//   len  element count; nothing is done when len <= 0 or a pointer is null
//
// The function returns void, so CUDA failures are reported on stderr and the
// remaining steps are skipped; device buffers are always released.
void distanceArray(float *out, float *in, float ref, int len)
{
	if (!out || !in || len <= 0)
	{
		return; // nothing to compute; a zero-block launch would be an invalid configuration
	}

	float *d_in = 0;
	float *d_out = 0;
	const size_t bytes = (size_t)len * sizeof(float);
	// Ceiling division so a trailing partial block is still launched
	// (len / TPB would drop the tail, or launch nothing when len < TPB).
	const int blocks = (len - 1) / TPB + 1;

	// Allocate device buffers.
	cudaError_t status = cudaMalloc(&d_in, bytes);
	if (status == cudaSuccess)
	{
		status = cudaMalloc(&d_out, bytes);
	}

	// Copy input data from host to device.
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice);
	}

	// Launch the kernel to compute and store the distance values.
	if (status == cudaSuccess)
	{
		distanceKernel <<< blocks, TPB >>> (d_out, d_in, ref, len);
		status = cudaGetLastError(); // a launch returns nothing; fetch launch errors explicitly
	}

	// Copy results from device to host. The direction must be DeviceToHost
	// (the previous code passed HostToDevice here). This blocking copy also
	// waits for the kernel, so execution errors surface in its return value.
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost);
	}

	if (status != cudaSuccess)
	{
		fprintf(stderr, "distanceArray failed: %s\n", cudaGetErrorString(status));
	}

	// Release device memory; the pointers are still 0 if allocation never happened.
	cudaFree(d_in);
	cudaFree(d_out);
}

main.cpp

#include "kernel.h"
#include<stdlib.h>
#define N 64

// Maps index i in [0, n-1] linearly onto [0, 1]: scale(0, n) == 0 and
// scale(n - 1, n) == 1.
//
// The previous expression `(float)i / n - 1` parsed as `(i / n) - 1` because
// division binds tighter than subtraction, producing values in [-1, 0)
// instead of dividing by (n - 1).
//
// Returns 0 when n <= 1, where the range is degenerate and the division
// would be by zero or by a negative count.
float scale(int i, int n)
{
	if (n <= 1)
	{
		return 0.0f;
	}
	// The cast forces floating-point division.
	return ((float)i) / (n - 1);
}

// Builds N input samples with scale(), asks distanceArray to compute their
// distances to a reference value, and releases the host buffers.
// Returns 0 on success and 1 when a host allocation fails.
int main()
{
	float *in = (float*)calloc(N, sizeof(float));
	float *out = (float*)calloc(N, sizeof(float));
	if (in == NULL || out == NULL)
	{
		// free(NULL) is a no-op, so both pointers can be released unconditionally.
		free(in);
		free(out);
		return 1;
	}

	// choose a reference value from which distances are measured
	const float ref = 0.5f;

	// Fill every element, including the last one: the previous bound
	// `i < N - 1` skipped index N - 1 and left it at calloc's zero.
	for (int i = 0; i < N; i++)
	{
		in[i] = scale(i, N);
	}
	distanceArray(out, in, ref, N);

	free(in);
	free(out);
	return 0;
}

补充说明下：TPB=32 定义在 kernel.cu 中，N=64 定义在 main.cpp 中，因此 distanceArray 启动 distanceKernel 时使用的是一个一维网格：一共包含 N / TPB = 2 个线程块，每个线程块有 32 个线程~

最后的结果如下，输出顺序是不规律的，原因详见上页第六条！

win10 cuda_小白之旅（4）：编写第一个cuda kernel

ok，第一个cu终于写完了...

欢迎大家交流，一起学习

参考：《cuda高性能并行计算》

nvidia：https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#introduction

win10 cuda_小白之旅（4）：编写第一个cuda kernel

1.新建sln（cuda）

aux_functions.h

kernel.cu

main.cpp

相关推荐