函数成员作为CUDA内核的参数
我正在使用动态并行机制,我想创建一个模板内核,给出一个对象指针+成员函数指针执行函数。这是一个最小(未)工作实例中,具有-arch = compute_35 -dlink标志编译,函数成员作为CUDA内核的参数
#include <iostream>
struct A
{
int i;
__device__ void clear()
{
i = 0;
}
};
template<typename Object, typename memberFunction>
__global__ void generalKernel(Object* o, memberFunction f)
{
(o->*f)();
}
template<typename Object, typename memberFunction>
__device__ void executeFunction(Object* o, memberFunction f)
{
generalKernel<<<1,1>>>(o,f);
cudaDeviceSynchronize();
}
__global__ void mainKernel(A* a)
{
executeFunction(a, &A::clear);
}
int main(int argc, char * argv[])
{
A* a;
cudaMallocManaged(&a, sizeof(A));
a->i = 1;
mainKernel<<<1,1>>>(a);
cudaDeviceSynchronize();
std::cout << a->i << std::endl;
return EXIT_SUCCESS;
}
下面是一个简单CUDA代码,以显示如何通过成员函数指针到内核。一切都在代码中解释。
#define gpuErrchk(val) \
cudaErrorCheck(val, __FILE__, __LINE__, true)
void cudaErrorCheck(cudaError_t err, char* file, int line, bool abort)
{
if(err != cudaSuccess)
{
printf("%s %s %d\n", cudaGetErrorString(err), file, line);
if(abort) exit(-1);
}
}
// struct holds an 'int' type data memeber and '__device__' function member
struct ST
{
int id;
__device__ void foo()
{
printf("value of id: %d\n",id);
}
};
// creating an alias for our function pointer
// since the function is a member of a struct, we add struct name and scope resolution 'ST::'
// to signify as such
typedef void (ST::*Fptr)(void);
// templated kernel
template<typename Object, typename memberFunction>
__global__ void kernel(Object* o, memberFunction f)
{
(o->*f)();
}
// declaring a __device__ function pointer, assigning it the address of 'ST::foo'
// remember that this function pointer is also direclty accessible from the kernel
__device__ Fptr fp = &ST::foo;
int main(int argc, char** argv)
{
// declaring and initializing a host 'ST' object
ST h_st;
h_st.id = 10;
// device 'ST' object
ST* d_st;
// allocating device memory
gpuErrchk(cudaMalloc((void**)&d_st, sizeof(ST)));
// copying host data from host object to device object
gpuErrchk(cudaMemcpy(d_st, &h_st, sizeof(ST), cudaMemcpyHostToDevice));
// declaring host side function pointer of type 'Fptr', which can be passed to kernel as argument
Fptr h_fptr;
// copying address of '__device__' function pointer to host side function pointer
gpuErrchk(cudaMemcpyFromSymbol(&h_fptr, fp, sizeof(Fptr)));
// passing arguments to kernel
kernel<<<1,1>>>(d_st,h_fptr);
// making sure no errors occured
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// free device memory
gpuErrchk(cudaFree(d_st));
return 0;
}
我正在寻找一个适合所有的解决方案,这就是为什么我使用模板。 Typedef函数指针并且每次声明它都是编码最差的,而不是为每个要调用的函数创建一个内核。 –
阅读此如何最好地使用函数指针https://isocpp.org/wiki/faq/pointers-to-members – zindarod
我设法使用标识符别名进行编译。现在这个问题与统一内存有关,就像在这种情况下一样:https://*.com/questions/42296854/assignment-of-function-pointer-with-the-unified-memory-in-cuda。我留下这个想法,并会找到另一个。 –
请提供一个简短的完整示例,其他人可以尝试编译并查看该问题。还要确定你的编译命令和编译器的确切输出 –
用一个完整的例子更新:)。该错误是相当长的提供 –