函数成员作为CUDA内核的参数

问题描述：

我正在使用动态并行机制，我想创建一个模板内核，给出一个对象指针+成员函数指针执行函数。这是一个最小（未）工作实例中，具有-arch = compute_35 -dlink标志编译，函数成员作为CUDA内核的参数

#include <iostream> 

struct A 
{ 
    int i; 
    __device__ void clear() 
    { 
     i = 0; 
    } 
}; 

template<typename Object, typename memberFunction> 
__global__ void generalKernel(Object* o, memberFunction f) 
{ 
    (o->*f)(); 
} 

template<typename Object, typename memberFunction> 
__device__ void executeFunction(Object* o, memberFunction f) 
{ 
    generalKernel<<<1,1>>>(o,f); 
    cudaDeviceSynchronize(); 
} 

__global__ void mainKernel(A* a) 
{ 
    executeFunction(a, &A::clear); 
} 

int main(int argc, char * argv[]) 
{ 
    A* a; 
    cudaMallocManaged(&a, sizeof(A)); 
    a->i = 1; 

    mainKernel<<<1,1>>>(a); 
    cudaDeviceSynchronize(); 

    std::cout << a->i << std::endl; 

    return EXIT_SUCCESS; 
}

请提供一个简短的完整示例，其他人可以尝试编译并查看该问题。还要确定你的编译命令和编译器的确切输出 –

用一个完整的例子更新:)。该错误是相当长的提供 –

答

下面是一个简单CUDA代码，以显示如何通过成员函数指针到内核。一切都在代码中解释。

#define gpuErrchk(val) \ 
    cudaErrorCheck(val, __FILE__, __LINE__, true) 
void cudaErrorCheck(cudaError_t err, char* file, int line, bool abort) 
{ 
    if(err != cudaSuccess) 
    { 
     printf("%s %s %d\n", cudaGetErrorString(err), file, line); 
     if(abort) exit(-1); 
    } 
} 


// struct holds an 'int' type data memeber and '__device__' function member 
struct ST 
{ 
    int id; 
    __device__ void foo() 
    { 
     printf("value of id: %d\n",id); 
    } 
}; 

// creating an alias for our function pointer 
// since the function is a member of a struct, we add struct name and scope resolution 'ST::' 
// to signify as such 
typedef void (ST::*Fptr)(void); 

// templated kernel 
template<typename Object, typename memberFunction> 
__global__ void kernel(Object* o, memberFunction f) 
{ 
    (o->*f)(); 
} 

// declaring a __device__ function pointer, assigning it the address of 'ST::foo' 
// remember that this function pointer is also direclty accessible from the kernel 
__device__ Fptr fp = &ST::foo; 


int main(int argc, char** argv) 
{ 
    // declaring and initializing a host 'ST' object 
    ST h_st; 
    h_st.id = 10; 


    // device 'ST' object 
    ST* d_st; 
    // allocating device memory 
    gpuErrchk(cudaMalloc((void**)&d_st, sizeof(ST))); 
    // copying host data from host object to device object 
    gpuErrchk(cudaMemcpy(d_st, &h_st, sizeof(ST), cudaMemcpyHostToDevice)); 

    // declaring host side function pointer of type 'Fptr', which can be passed to kernel as argument 
    Fptr h_fptr; 
    // copying address of '__device__' function pointer to host side function pointer 
    gpuErrchk(cudaMemcpyFromSymbol(&h_fptr, fp, sizeof(Fptr))); 


    // passing arguments to kernel 
    kernel<<<1,1>>>(d_st,h_fptr); 

    // making sure no errors occured 
    gpuErrchk(cudaPeekAtLastError()); 
    gpuErrchk(cudaDeviceSynchronize()); 

    // free device memory 
    gpuErrchk(cudaFree(d_st)); 

    return 0; 
}

我正在寻找一个适合所有的解决方案，这就是为什么我使用模板。 Typedef函数指针并且每次声明它都是编码最差的，而不是为每个要调用的函数创建一个内核。 –

阅读此如何最好地使用函数指针https://isocpp.org/wiki/faq/pointers-to-members – zindarod

我设法使用标识符别名进行编译。现在这个问题与统一内存有关，就像在这种情况下一样：https://*.com/questions/42296854/assignment-of-function-pointer-with-the-unified-memory-in-cuda。我留下这个想法，并会找到另一个。 –

函数成员作为CUDA内核的参数

相关推荐