win10 cuda_小白之旅(4):编写第一个cuda kernel
博主自己学习,仅此记录,并方便学过c、已经配置好cuda的朋友交流学习。(我机子cuda9.0)
我们在win10 cuda_小白之旅(2)中用c写了三个程序,现在我们将它们用cuda的方式重新书写~~
win10 cuda_小白之旅(2)地址:https://blog.csdn.net/qq_39575835/article/details/82970615
1.新建sln(cuda)
创建成功后,会自动产生一个.cu的样本程序,我们以它为模板进行改写~如果你不确定和我是一样的,我把代码贴在下面:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Forward declaration of the host-side helper defined after main(): it adds the
// `size`-element arrays a and b on the GPU, writes the sums to c, and returns
// the cudaError_t status of the last CUDA call it made.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector addition kernel: c[t] = a[t] + b[t].
// Each thread handles only the element selected by its threadIdx.x and there is
// no bounds check, so the launch must supply exactly one thread per element
// within a single block.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const unsigned int idx = threadIdx.x;
    const int sum = a[idx] + b[idx];
    c[idx] = sum;
}
// Entry point of the template program: adds two fixed 5-element vectors on the
// GPU via addWithCuda, prints the sums, then resets the device.
// Returns 0 on success and 1 if either CUDA step reports an error.
int main()
{
    const int kCount = 5;
    const int lhs[kCount] = { 1, 2, 3, 4, 5 };
    const int rhs[kCount] = { 10, 20, 30, 40, 50 };
    int sums[kCount] = { 0 };

    // Run the element-wise addition on the device.
    if (addWithCuda(sums, lhs, rhs, kCount) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           sums[0], sums[1], sums[2], sums[3], sums[4]);

    // The device has to be reset before exit so that profiling and tracing
    // tools (Nsight, Visual Profiler) can show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}
// Adds two host vectors on the GPU.
//
// Parameters:
//   c    - host buffer that receives `size` sums
//   a, b - host input vectors holding `size` ints each
//   size - element count; also used as the thread count of the one block launched
//
// Returns the status of the first CUDA call that failed, or the status of the
// final device-to-host copy when every step succeeded. A message is written to
// stderr on failure, and the device buffers are released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *devLhs = 0;
    int *devRhs = 0;
    int *devSum = 0;
    const size_t byteCount = size * sizeof(int);
    cudaError_t status;

    // Single-pass block: `break` jumps straight to the cleanup below.
    do {
        // Select the GPU to run on; change this on a multi-GPU system.
        status = cudaSetDevice(0);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            break;
        }

        // Device buffers: one output vector, then the two input vectors.
        status = cudaMalloc((void**)&devSum, byteCount);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        status = cudaMalloc((void**)&devLhs, byteCount);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        status = cudaMalloc((void**)&devRhs, byteCount);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        // Upload both inputs from host memory.
        status = cudaMemcpy(devLhs, a, byteCount, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        status = cudaMemcpy(devRhs, b, byteCount, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        // One block, one thread per element.
        addKernel<<<1, size>>>(devSum, devLhs, devRhs);

        // A launch does not return a status itself; query it explicitly.
        status = cudaGetLastError();
        if (status != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(status));
            break;
        }

        // Wait for the kernel to finish and pick up any error it raised.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", status);
            break;
        }

        // Download the result vector into host memory.
        status = cudaMemcpy(c, devSum, byteCount, cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }
    } while (false);

    cudaFree(devSum);
    cudaFree(devLhs);
    cudaFree(devRhs);

    return status;
}
你不用全部看懂:这段模板里大部分代码都是在每次调用cuda函数之后检查返回值,出错时打印提示并释放显存。你可以把它看作nvidia给我们的一个带错误检查的示例程序。
回想一下原来的三个程序:win10 cuda_小白之旅(2)地址:https://blog.csdn.net/qq_39575835/article/details/82970615
这里引入了新的东西:我先介绍一下~
P.S.记录下:个人觉得官网说的主机就是cpu,设备就是gpu。
- stdio是标准io函数,比如printf和scanf函数,这个我们很熟悉,今天要说的是stdlib,stdlib里的是常用系统函数,跟系统调用相关的,比如内存申请malloc和释放free,我们在gpu上操作,自然是要跟内存打交道了。
- __host__ 一般缺省,主机调用,主机执行。(说白了:主机与设备交互数据,调用device,global等等)
- __device__ 设备端调用,设备端执行。
- __global__主机调用,设备执行。
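- 需要注意的是,kernel在gpu上执行,所以它不能直接访问主机内存,只能访问gpu的设备内存。因此我们要学习cudaMalloc()函数:它的作用类似于malloc(),用来为设备数组分配内存;但用法略有不同——设备指针通过第一个参数(指针的地址)返回,函数返回值是cudaError_t错误码,例如 cudaMalloc(&d_in, len * sizeof(float));用完后要用cudaFree()释放。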
- 注意,由于是并行的,输出结果很可能无序。
接下来贴代码,请仔细品味与之前三个源程序不同。
kernel.h(原文写作 aux_functions.h;但下面的 kernel.cu 和 main.cpp 都是 #include "kernel.h",头文件名需要与之一致)
#pragma once
// Host entry point implemented in kernel.cu. For each of the `len` values in
// `in` it computes, on the GPU, the distance to the reference value `ref`;
// the results are meant to be returned through `out`.
void distanceArray(float *out, float *in, float ref, int len);
kernel.cu
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include"stdio.h" //使程序可以在后台输出
#include "kernel.h"
#define TPB 32 //一个线程块包含TPB个线程
//cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
//
//__global__ void addKernel(int *c, const int *a, const int *b)
//{
// int i = threadIdx.x;
// c[i] = a[i] + b[i];
//}
// 注释了 放在main里了
//float scale(int i, int n)
//{
// // 强制类型转换,使结果变为float
// return ((float)i / n - 1);
//}
// Device-only helper: returns the absolute difference |x1 - x2|.
__device__
float distance(float x1, float x2)
{
    const float delta = x1 - x2;
    return fabsf(delta);
}
// GPU counterpart of the earlier CPU distanceArray loop.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// contain more threads than elements; threads whose index is >= len exit
// without touching memory.
//
// Parameters:
//   d_out - device buffer receiving len distances
//   d_in  - device buffer holding len input values
//   ref   - reference value the distances are measured from
//   len   - number of valid elements in d_in / d_out
__global__
void distanceKernel(float *d_out, float *d_in, float ref, int len)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x; // global element index
    if (i >= len) {
        return; // surplus thread in the last block
    }
    const float x = d_in[i];
    d_out[i] = distance(x, ref);
    // Device-side printf; the order of lines across threads is not deterministic.
    printf("i=%2d :dist from %f to %f is %f.\n", i, ref, x, d_out[i]);
}

// Host wrapper (implicitly __host__): computes out[i] = |in[i] - ref| for
// i in [0, len) on the GPU.
//
// Parameters:
//   out - host buffer receiving len results
//   in  - host buffer holding len input values
//   ref - reference value
//   len - element count; nothing is done when len <= 0
//
// On any CUDA failure a message is written to stderr and `out` is left
// untouched by the failed step. Device buffers are always released.
void distanceArray(float *out, float *in, float ref, int len)
{
    if (len <= 0) {
        return; // a zero-block launch would be an invalid configuration
    }

    float *d_in = 0;
    float *d_out = 0;
    const size_t bytes = (size_t)len * sizeof(float);

    // Allocate device buffers.
    cudaError_t status = cudaMalloc(&d_in, bytes);
    if (status == cudaSuccess) {
        status = cudaMalloc(&d_out, bytes);
    }

    // Copy input data from host to device.
    if (status == cudaSuccess) {
        status = cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice);
    }

    // Launch enough blocks to cover every element, rounding up so a tail that
    // does not fill a whole block is still processed.
    if (status == cudaSuccess) {
        const int blocks = len / TPB + (len % TPB != 0 ? 1 : 0);
        distanceKernel<<<blocks, TPB>>>(d_out, d_in, ref, len);
        status = cudaGetLastError(); // launches report errors only through this call
    }

    // Copy results from device back to host. This blocking copy also waits for
    // the kernel and reports any error raised while it ran.
    if (status == cudaSuccess) {
        status = cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "distanceArray failed: %s\n", cudaGetErrorString(status));
    }

    // Release device memory; cudaFree accepts a null pointer.
    cudaFree(d_in);
    cudaFree(d_out);
}
main.cpp
#include "kernel.h"
#include<stdlib.h>
#define N 64
// Maps the indices 0, 1, ..., n-1 to evenly spaced floats from 0 to 1.
//
// Parameters:
//   i - index to convert
//   n - number of sample points
// Returns i / (n - 1) as a float; returns 0 when n <= 1, where the spacing is
// undefined and the division would otherwise be by zero.
float scale(int i, int n)
{
    if (n <= 1) {
        return 0.0f;
    }
    // The cast forces floating-point division; the divisor is (n - 1) so that
    // the last index lands exactly on 1.
    return ((float)i) / (n - 1);
}
// Builds N evenly scaled input values, asks distanceArray to compute their
// distances from a reference value on the GPU, and releases the buffers.
// Returns 0 on success, 1 if a host allocation fails.
int main()
{
    float *in = (float*)calloc(N, sizeof(float));
    float *out = (float*)calloc(N, sizeof(float));
    if (in == NULL || out == NULL) {
        free(in);  // free(NULL) is a no-op
        free(out);
        return 1;
    }

    // Choose a reference value from which distances are measured.
    const float ref = 0.5f;

    // Fill every element, including the last one at index N - 1.
    for (int i = 0; i < N; i++)
    {
        in[i] = scale(i, N);
    }

    distanceArray(out, in, ref, N);

    free(in);
    free(out);
    return 0;
}
补充说明下:kernel.cu中的TPB=32,main.cpp中的N=64,则由distanceKernel <<< len / TPB, TPB >>> (d_out, d_in, ref);可知这是一个一维网格,本次一共包含了 64/32=2 个线程块,每个线程块有32个线程~
最后的结果如下,输出顺序是不规律的,原因详见上文列表的第六条(线程并行执行,输出结果很可能无序)!
ok,第一个cu终于写完了...
欢迎大家交流,一起学习
参考:《cuda高性能并行计算》
nvidia:https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#introduction