未知错误

未知错误

问题描述:

我是新来的OpenCL编程,并试图在C.创建以下的 “Hello World” 式的OpenCL计划未知错误

内核代码(hello.cl)

__kernel void hello(__global int* input, __global int* output, const unsigned int count) 
{ 

int i = get_global_id(0); 
if(i < count) 
    output[i] = input[i] * input[i]; 
} 

主机代码:

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 

#define DATA_SIZE (10) 

#ifdef __APPLE__ 
#include <OpenCL/opencl.h> 
#else 
#include <CL/cl.h> 
#endif 

#define MAX_SOURCE_SIZE (0x100000) 

int main() 
{ 
    size_t    count   = DATA_SIZE; 
    cl_platform_id  platform_id  = NULL; 
    cl_device_id  device_id  = NULL; 
    cl_context   context   = NULL; 
    cl_command_queue command_queue = NULL; 
    cl_mem    memobj   = NULL; 
    cl_program   program   = NULL; 
    cl_kernel   kernel   = NULL; 

    cl_uint    ret_num_devices; 
    cl_uint    ret_num_platforms; 
    cl_int    ret; 

    size_t global;      // local domain size for our calculation 
    size_t local;      // local domain size for our calculation 

    FILE    *fp; 
    char    fileName[] = "./hello.cl"; 
    char    *source_str; 
    size_t    source_size; 

    /* Load the source code containing the kernel*/ 
    fp = fopen(fileName, "r"); 
    if (!fp) 
    { 
     fprintf(stderr, "Failed to load kernel.\n"); 
     exit(1); 
    } 

    source_str = (char*)malloc(MAX_SOURCE_SIZE); 
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); 
    fclose(fp); 

    /* Get Platform and Device Info */ 
    ret    = clGetPlatformIDs  (1, &platform_id, &ret_num_platforms); 

    ret    = clGetDeviceIDs  (platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); 
    if (ret != CL_SUCCESS) 
    { 
     printf("Error: Failed to create a device group!\n"); 
     return EXIT_FAILURE; 
    } 

    /* Create OpenCL context */ 
    context   = clCreateContext  (NULL, 1, &device_id, NULL, NULL, &ret); 
    if (!context) 
    { 
     printf("Error: Failed to create a compute context!\n"); 
     return EXIT_FAILURE; 
    } 

    /* Create Command Queue */ 
    command_queue = clCreateCommandQueue (context, device_id, 0, &ret); 
    if (!command_queue ) 
    { 
     printf("Error: Failed to create a command commands!\n"); 
     return EXIT_FAILURE; 
    } 


    /* Create Kernel Program from the source */ 
    program   = clCreateProgramWithSource (context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); 
    if (!program) 
    { 
     printf("Error: Failed to create compute program!\n"); 
     return EXIT_FAILURE; 
    } 

    /* Build Kernel Program */ 
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 
    if (ret != CL_SUCCESS) 
    { 
     size_t len; 
     char buffer[2048]; 

     printf("Error: Failed to build program executable!\n"); 
     clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); 
     printf("%s\n", buffer); 
     exit(1); 
    } 

    /* Create OpenCL Kernel */ 
    kernel   = clCreateKernel(program, "hello", &ret); 
    if (!kernel || ret != CL_SUCCESS) 
    { 
     printf("Error: Failed to create compute kernel!\n"); 
     exit(1); 
    }  

    int data[DATA_SIZE];    // original data set given to device 
    int results[DATA_SIZE];   // results returned from device 
    int i = 0; 
    for(i = 0; i < count; i++) 
     data[i] = i+1; 


    cl_mem input;      // device memory used for the input array 
    cl_mem output;      // device memory used for the output array 
    input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL); 
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL); 
    if (!input || !output) 
    { 
     printf("Error: Failed to allocate device memory!\n"); 
     exit(1); 
    }  

    ret = clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, sizeof(int) * count, data, 0, NULL, NULL); 
    if (ret != CL_SUCCESS) 
    { 
     printf("Error: Failed to write to source array!\n"); 
     exit(1); 
    } 

    /* Set OpenCL Kernel Parameters */ 
    ret = 0; 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); 
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); 
    ret |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); 
    if (ret != CL_SUCCESS) 
    { 
     printf("Error: Failed to set kernel arguments! %d\n", ret); 
     exit(1); 
    } 

    /* Execute OpenCL Kernel */ 
    ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); 
    if (ret != CL_SUCCESS) 
    { 
     printf("Error: ! %d\n", ret); 
     scanf("%d",&global); 
     exit(1); 
    } 

    global = (size_t) count; 
    ret    = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); 

    clFinish(command_queue); 

    /* Copy results from the memory buffer */ 
    ret = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(int) * count, results, 0, NULL, NULL); 
    if (ret != CL_SUCCESS) 
    { 
     printf("Error: Failed to read output array! %d\n", ret); 
     exit(1); 
    } 

    // Validate our results 
    // 
    int correct = 0; 
    for(i = 0; i < count; i++) 
    { 
     printf("%d-%d\n",data[i],results[i]); 
     if(results[i] == data[i] * data[i]) 
      correct++; 
    } 

    // Print a brief summary detailing the results 
    // 
    printf("Computed '%d/%d' correct values!\n", correct, count); 

    scanf("%d",&ret); 
    /* Finalization */ 
    ret = clFlush(command_queue); 
    ret = clFinish(command_queue); 
    ret = clReleaseKernel(kernel); 
    ret = clReleaseProgram(program); 
    ret = clReleaseMemObject(memobj); 
    ret = clReleaseCommandQueue(command_queue); 
    ret = clReleaseContext(context); 

    clReleaseMemObject(input); 
    clReleaseMemObject(output); 

    free(source_str); 

    return 0; 
} 

这看起来很简单的代码,但是,结果数组包含所有零值。任何人都可以告诉我我在这里犯了什么错误。该程序的输出如下:

1-0 
2-0 
3-0 
4-0 
5-0 
6-0 
7-0 
8-0 
9-0 
10-0 
Computed '0/10' correct values! 

我无法找到任何原因的GPU计算不正确的值。

+0

群体,什么是错误?这是一个构建错误?运行时错误?加载内核时出错?请详细说明。并请[请阅读如何提出良好问题](http://*.com/help/how-to-ask)。 –

+0

@Joachim Pileborg:我添加了输出。希望这会有所帮助。 –

+0

你没有检查所有的调用返回值。 – DarkZeros

您不检查clEnqueueNDRangeKernel的返回值,这几乎肯定会失败,因为您无法确保您的全局尺寸是本地尺寸的整数倍。

从查询CL_KERNEL_WORK_GROUP_SIZE结果可能是类似256,但您的全球作业大小是10你不能用10项细分工作到256

+0

非常感谢。我不知道这个事实。现在它可以工作。 –