CUDA C++: Is there anything special about the number 64?

I'm seeing pathological behavior from a numerical integrator I wrote in C++ and CUDA to run on my GPU. The integrator currently uses a fixed step size, and the moment I set the number of points (numpoints) to 65 (which makes the step size 1/65 and sets the width of the 2D array of computed data to 65), the integrator stops working and some function somewhere appears to return 0. Is there a problem with 2D arrays that are wider than 64 doubles?

I tried implementing the macro talonmies wrote in What is the canonical way to check for errors using the CUDA runtime API? I get errors from my kernel and from the step that copies the data computed by the kernel back to the host: "GPUassert: invalid argument." I don't know how to interpret this error or where to go from here.

My guess is that it has something to do with the 2D array being wider than 64, or with the pitch and the way things are stored in the 2D array on the device. Does the following code fill the columns of the 2D array correctly?
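(For reference, here is my mental model of how a row of a pitched allocation is addressed on the device, as a minimal sketch; readElement and its parameter names are made up for illustration and are not part of the program below.)

//Minimal sketch of pitched row addressing: the pitch is the allocated row width
//in bytes (possibly larger than width*sizeof(double)), so rows are stepped
//through byte-wise before indexing within the row.
__device__ double readElement(double* base, size_t pitch, int row, int col)
{
    double* rowPtr = (double*)((char*)base + row * pitch);  //start of the requested row
    return rowPtr[col];                                     //element within that row
}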

#include <cuda.h> 
#include <cuda_runtime.h> 
#include <device_launch_parameters.h> 
#include <stdio.h> 
#include <iostream> 
#include <iomanip>      //display 2 decimal places 
using namespace std; 

__global__ void rkf5(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int*, size_t, double*, double*, double*); 
__global__ void calcK(double*, double*, double*); 
__global__ void k1(double*, double*, double*); 
__global__ void k2(double*, double*, double*); 
__global__ void k3(double*, double*, double*); 
__global__ void k4(double*, double*, double*); 
__global__ void k5(double*, double*, double*); 
__global__ void k6(double*, double*, double*); 
__global__ void arrAdd(double*, double*, double*); 
__global__ void arrMult(double*, double*, double*); 
__global__ void arrInit(double*, double); 
__device__ void setup(double , double*, double*, double*, double*, int*); 
__device__ double flux(int, double*) ; 
__global__ void storeConcs(double*, size_t, double*, int); 
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*); 
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*); 

//Error checking that I don't understand yet. 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (abort) exit(code); 
    } 
} 

//Main program. 
int main(int argc, char** argv) 
{ 

    //std::cout << std::fixed;   //display 2 decimal places 
    //std::cout << std::setprecision(16); //display 2 decimal places 
    const int maxlength = 1;   //Number of discrete concentrations we are tracking. 
    double concs[maxlength];   //Meant to store the current concentrations 
    double temp[maxlength];    //Used as a bin to store products of Butcher's tableau and k values. 
    double tempsum[maxlength];   //Used as a bin to store cumulative sum of tableau and k values 
    double k1s[maxlength]; 
    double k2s[maxlength]; 
    double k3s[maxlength]; 
    double k4s[maxlength]; 
    double k5s[maxlength]; 
    double k6s[maxlength]; 
    const int numpoints = 64;  
    double to = 0; 
    double tf = .5; 
    //double dt = static_cast<double>(.5)/static_cast<double>(64); 
    double dt = (tf-to)/static_cast<double>(numpoints); 
    double mo = 1; 
    double concStorage[maxlength][numpoints]; //Stores concs vs. time      

    //Initialize all the arrays on the host to ensure arrays of 0's are sent to the device. 
    //Also, here is where we can seed the system. 
    std::cout<<dt; 
    std::cout<<"\n"; 
    concs[0]=mo; 
    std::cout<<concs[0]; 
    std::cout<<" "; 
    for (int i=0; i<maxlength; i++) 
    { 
     for (int j=0; j<numpoints; j++) 
      concStorage[i][j]=0; 
     concs[i]=0; 
     temp[i]=0; 
     tempsum[i]=0; 
     k1s[i]=0; 
     k2s[i]=0; 
     k3s[i]=0; 
     k4s[i]=0; 
     k5s[i]=0; 
     k6s[i]=0; 
     std::cout<<concs[i]; 
     std::cout<<" "; 
    } 
    concs[0]=mo; 
    std::cout<<"\n"; 

    //Define all the pointers to device array memory addresses. These contain the on-GPU 
    //addresses of all the data we're generating/using. 
    double *d_concs; 
    double *d_temp; 
    double *d_tempsum; 
    double *d_k1s; 
    double *d_k2s; 
    double *d_k3s; 
    double *d_k4s; 
    double *d_k5s; 
    double *d_k6s; 
    double *d_dt; 
    int *d_maxlength; 
    int *d_numpoints; 
    double *d_to; 
    double *d_tf; 
    double *d_concStorage; 

    //Calculate all the sizes of the arrays in order to allocate the proper amount of memory on the GPU. 
    size_t size_concs = sizeof(concs); 
    size_t size_temp = sizeof(temp); 
    size_t size_tempsum = sizeof(tempsum); 
    size_t size_ks = sizeof(k1s); 
    size_t size_maxlength = sizeof(maxlength); 
    size_t size_numpoints = sizeof(numpoints); 
    size_t size_dt = sizeof(dt); 
    size_t size_to = sizeof(to); 
    size_t size_tf = sizeof(tf); 
    size_t h_pitch = numpoints*sizeof(double); 
    size_t d_pitch; 

    //Calculate the "pitch" of the 2D array. The pitch is basically the allocated length of a 2D array's row. It's larger 
    //than the actual row of data due to hardware alignment requirements. We therefore use the pitch instead of the data 
    //size to traverse the array. 
    gpuErrchk(cudaMallocPitch((void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints)); 

    //Allocate memory on the GPU for all the arrays we're going to use in the integrator. 
    cudaMalloc((void**)&d_concs, size_concs); 
    cudaMalloc((void**)&d_temp, size_temp); 
    cudaMalloc((void**)&d_tempsum, size_tempsum); 
    cudaMalloc((void**)&d_k1s, size_ks); 
    cudaMalloc((void**)&d_k2s, size_ks); 
    cudaMalloc((void**)&d_k3s, size_ks); 
    cudaMalloc((void**)&d_k4s, size_ks); 
    cudaMalloc((void**)&d_k5s, size_ks); 
    cudaMalloc((void**)&d_k6s, size_ks); 
    cudaMalloc((void**)&d_maxlength, size_maxlength); 
    cudaMalloc((void**)&d_numpoints, size_numpoints); 
    cudaMalloc((void**)&d_dt, size_dt); 
    cudaMalloc((void**)&d_to, size_to); 
    cudaMalloc((void**)&d_tf, size_tf); 

    //Copy all initial values of arrays to GPU. 
    cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_temp, &temp, size_temp, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_tempsum, &tempsum, size_tempsum, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_k1s, &k1s, size_ks, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_k2s, &k2s, size_ks, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_k3s, &k3s, size_ks, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_k4s, &k4s, size_ks, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_k5s, &k5s, size_ks, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_k6s, &k6s, size_ks, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_maxlength, &maxlength, size_maxlength, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_numpoints, &numpoints, size_numpoints, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice); 

    //Run the integrator. 
    rkf5<<<1,1>>>(d_concs, d_concStorage, d_temp, d_tempsum, d_k1s, d_k2s, d_k3s, d_k4s, d_k5s, d_k6s, d_maxlength, d_numpoints, d_pitch, d_dt, d_to, d_tf); 
    //gpuErrchk(cudaPeekAtLastError()); 
    //gpuErrchk(cudaDeviceSynchronize()); 
    cudaDeviceSynchronize(); 

    //Copy concentrations from GPU to Host. Almost defunct now that transferring the 2D array works. 
    cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost); 
    //Copy 2D array of concentrations vs. time from GPU to Host. 
    gpuErrchk(cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyDeviceToHost)); 

    //Print concentrations after the integrator kernel runs. Used to test that data was transferring to and from GPU correctly. 
    std::cout << "\n"; 
    for (int i=0; i<maxlength; i++) 
    { 
     std::cout<<concs[i]; 
     std::cout<<" "; 
    } 

    //Print out the concStorage array after the kernel runs. Used to test that the 2D array transferred correctly from host to GPU and back. 
    std::cout << "\n"; 
    for (int i=0; i<maxlength; i++) 
    { 
     for(int j=0; j<numpoints; j++) 
     { 
      std::cout<<concStorage[i][j]; 
      std::cout<<" "; 
     } 
     std::cout << "\n"; 
    } 
    std::cout << "\n"; 

    cudaDeviceReset(); //Clean up all memory. 

    return 0; 
} 
//Main kernel. This is meant to be run as a master thread that calls all the other functions and thus "runs" the integrator. 
__global__ void rkf5(double* concs, double* concStorage, double* temp, double* tempsum, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s, int* maxlength, int* numpoints, size_t pitch, double* dt, double* to, double* tf) 
{ 
    /* 
    axy variables represent the coefficients in the Butcher's tableau where x represents the order of the step and the y value corresponds to the ky value 
    the coefficient gets multiplied by. Have to cast them all as doubles, or the ratios evaluate as integers. 
    e.g. a21 -> a21 * k1 
    e.g. a31 -> a31 * k1 + a32 * k2 
    */ 
    double a21 = static_cast<double>(.25); 

    double a31 = static_cast<double>(3)/static_cast<double>(32); 
    double a32 = static_cast<double>(9)/static_cast<double>(32); 

    double a41 = static_cast<double>(1932)/static_cast<double>(2197); 
    double a42 = static_cast<double>(-7200)/static_cast<double>(2197); 
    double a43 = static_cast<double>(7296)/static_cast<double>(2197); 

    double a51 = static_cast<double>(439)/static_cast<double>(216); 
    double a52 = static_cast<double>(-8); 
    double a53 = static_cast<double>(3680)/static_cast<double>(513); 
    double a54 = static_cast<double>(-845)/static_cast<double>(4104); 

    double a61 = static_cast<double>(-8)/static_cast<double>(27); 
    double a62 = static_cast<double>(2); 
    double a63 = static_cast<double>(-3544)/static_cast<double>(2565); 
    double a64 = static_cast<double>(1859)/static_cast<double>(4104); 
    double a65 = static_cast<double>(-11)/static_cast<double>(40); 

    //for loop that integrates over the specified number of points. Actually, might have to make it a do-while loop for adaptive step sizes 
    for(int k = 0; k < *numpoints; k++) 
    { 

     calcK<<< 1, *maxlength >>>(concs, k1s, dt);   //k1 = dt * flux (concs) 
     cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished 

     setup(a21, temp, tempsum, k1s, concs, maxlength); //tempsum = a21*k1 
     arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum); //tempsum = concs + a21*k1 
     cudaDeviceSynchronize(); 

     calcK<<< 1, *maxlength >>>(tempsum, k2s, dt);  //k2 = dt * flux (concs + a21*k1) 
     cudaDeviceSynchronize(); 

     setup(a31, temp, tempsum, k1s, concs, maxlength); //tempsum = a31*k1 
     setup(a32, temp, tempsum, k2s, concs, maxlength); //tempsum = a31*k1 + a32*k2 
     arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum); //tempsum = concs + a31*k1 + a32*k2 
     cudaDeviceSynchronize(); 

     calcK<<< 1, *maxlength >>>(tempsum, k3s, dt);   //k3 = dt * flux (concs + a31*k1 + a32*k2) 
     cudaDeviceSynchronize(); 

     setup(a41, temp, tempsum, k1s, concs, maxlength); //tempsum = a41*k1 
     setup(a42, temp, tempsum, k2s, concs, maxlength); //tempsum = a41*k1 + a42*k2 
     setup(a43, temp, tempsum, k3s, concs, maxlength); //tempsum = a41*k1 + a42*k2 + a43*k3 
     arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum); //tempsum = concs + a41*k1 + a42*k2 + a43*k3 
     cudaDeviceSynchronize(); 

     calcK<<< 1, *maxlength >>>(tempsum, k4s, dt);   //k4 = dt * flux (concs + a41*k1 + a42*k2 + a43*k3) 
     cudaDeviceSynchronize(); 

     setup(a51, temp, tempsum, k1s, concs, maxlength); //tempsum = a51*k1 
     setup(a52, temp, tempsum, k2s, concs, maxlength); //tempsum = a51*k1 + a52*k2 
     setup(a53, temp, tempsum, k3s, concs, maxlength); //tempsum = a51*k1 + a52*k2 + a53*k3 
     setup(a54, temp, tempsum, k4s, concs, maxlength); //tempsum = a51*k1 + a52*k2 + a53*k3 + a54*k4 
     arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum); //tempsum = concs + a51*k1 + a52*k2 + a53*k3 + a54*k4 
     cudaDeviceSynchronize(); 

     calcK<<< 1, *maxlength >>>(tempsum, k5s, dt);   //k5 = dt * flux (concs + a51*k1 + a52*k2 + a53*k3 + a54*k4) 
     cudaDeviceSynchronize(); 

     setup(a61, temp, tempsum, k1s, concs, maxlength); //tempsum = a61*k1 
     setup(a62, temp, tempsum, k2s, concs, maxlength); //tempsum = a61*k1 + a62*k2 
     setup(a63, temp, tempsum, k3s, concs, maxlength); //tempsum = a61*k1 + a62*k2 + a63*k3 
     setup(a64, temp, tempsum, k4s, concs, maxlength); //tempsum = a61*k1 + a62*k2 + a63*k3 + a64*k4 
     setup(a65, temp, tempsum, k4s, concs, maxlength); //tempsum = a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5 
     arrAdd<<< 1, *maxlength >>>(concs, temp, tempsum); //tempsum = concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5 
     cudaDeviceSynchronize(); 

     calcK<<< 1, *maxlength >>>(tempsum, k6s, dt);   //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5) 
     cudaDeviceSynchronize(); 

     //At this point, temp and tempsum are maxlength dimension arrays that are able to be used for other things. 

     /* 
     //All this is is a way of printing all my k values in a 2D array. No bearing on actual program. 
     for (int i = 0; i < *maxlength; i++) 
     { 
      switch (j) 
      { 
      case 0: concs[i]=k1s[i]; 
       break; 
      case 1: concs[i]=k2s[i]; 
       break; 
      case 2: concs[i]=k3s[i]; 
       break; 
      case 3: concs[i]=k4s[i]; 
       break; 
      case 4: concs[i]=k5s[i]; 
       break; 
      } 
     } 
     */ 
     //calcStepSize 
     takeFifthOrderStep<<< 1, *maxlength >>>(concs, k1s, k2s, k3s, k4s, k5s, k6s); 
     cudaDeviceSynchronize(); 
     storeConcs<<< 1, *maxlength >>>(concStorage, pitch, k1s, k); 
     cudaDeviceSynchronize(); 

    } 
} 
//calcStepSize will take in an error tolerance, the current concentrations and the k values and calculate the resulting step size according to the following equation 
//e[n+1]=y4[n+1] - y5[n+1] 
//__global__ void calcStepSize(double *y5, double* y4) 


//takeFourthOrderStep is going to overwrite the old temp array with the new array of concentrations that result from a 4th order step. This kernel is meant to be launched 
// with as many threads as there are discrete concentrations to be tracked. 
__global__ void takeFourthOrderStep(double* concs, double* k1s, double* k2s,double* k3s, double* k4s, double* k5s) 
{ 
    double b41 = static_cast<double>(25)/static_cast<double>(216); 
    double b42 = static_cast<double>(0); 
    double b43 = static_cast<double>(1408)/static_cast<double>(2565); 
    double b44 = static_cast<double>(2197)/static_cast<double>(4104); 
    double b45 = static_cast<double>(-1)/static_cast<double>(5); 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    concs[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx]; 
} 
//takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations. As of now, this will be the 5th order step. Another function can be 
//defined that will take a fourth order step if that is interesting for any reason. This kernel is meant to be launched with as many threads as there are discrete concentrations 
//to be tracked. 
//Store b values in register? Constants? 
__global__ void takeFifthOrderStep(double* concs, double* k1s, double* k2s,double* k3s, double* k4s, double* k5s, double* k6s) 
{ 
    double b51 = static_cast<double>(16)/static_cast<double>(135); 
    double b52 = static_cast<double>(0); 
    double b53 = static_cast<double>(6656)/static_cast<double>(12825); 
    double b54 = static_cast<double>(28561)/static_cast<double>(56430); 
    double b55 = static_cast<double>(-9)/static_cast<double>(50); 
    double b56 = static_cast<double>(2)/static_cast<double>(55); 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    concs[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx]; 
} 
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array 
//pitch = memory size of a row 
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId) 
{ 
    int tIdx = threadIdx.x; 
    //cS is basically the memory address of the first element of the flattened (1D) 2D array. 
    double* row = (double*)((char*)cS + tIdx * pitch); 
    row[cId] = concs[tIdx]; 
} 
//Perhaps I can optimize by using shared memory to hold conc values. 
__global__ void calcK(double* concs, double* ks, double* dt) 
{ 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    ks[idx]=(*dt)*flux(idx, concs); 
} 
//Adds two arrays (a and b) element by element and stores the result in array c. 
__global__ void arrAdd(double* a, double* b, double* c) 
{             
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    c[idx]=a[idx]+b[idx]; 
} 
//Multiplies two arrays (a and b) element by element and stores the result in array c. 
__global__ void arrMult(double* a, double* b, double* c) 
{ 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    c[idx]=a[idx]*b[idx]; 
} 
//Initializes an array a to double value b. 
__global__ void arrInit(double* a, double b) 
{ 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    a[idx]=b; 
} 
//Placeholder function for the flux calculation. It will take the size of the oligomer and current concentrations as inputs. 
__device__ double flux(int r, double *concs) 
{ 
    return -concs[r]; 
} 
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum. Used to 
//add all the a*k terms. 
__device__ void setup(double tableauValue, double *temp, double *tempsum, double *ks, double *concs, int *maxlength) 
{ 
    //Sets tempsum to tabVal * k 
    arrInit<<< 1, *maxlength >>>(temp, tableauValue);  //Set [temp] to tableau value 
    cudaDeviceSynchronize(); 
    arrMult<<< 1, *maxlength >>>(ks, temp, temp);   //Multiply tableau value by appropriate [k] 
    cudaDeviceSynchronize(); 
    arrAdd<<< 1, *maxlength >>>(tempsum, temp, tempsum); //Move tabVal*k to [tempsum] 
    cudaDeviceSynchronize(); 
} 

/* 
__device__ double knowles_flux(int r, double *conc, double *params) 
{ 

    const double nc = params[0]; 
    const double ka = params[1]; 
    //const float kb = params[2]; 
    //const float kp = params[3]; 
    const double km = params[4]; 
    const double kn = params[5]; 
    //const float n2 = params[6]; 
    //const float kn2 = params[7]; 
    const int maxlength = params[8]; 


    const int r = blockIdx.x*blockDim.x + threadIdx.x; 

    double frag_term = 0; 
    double flux = 0; 
    if (r == (maxlength-1)) 
     { 
     flux = -km*(r)*conc[r]+2*ka*conc[r-1]*conc[0]; 
     } 
    else if (r > (nc-1)) 
     { 
     for (int s = r+1; s < maxlength; s++) 
      { 
      frag_term += conc[s]; 
      } 
     //double frag_term = thrust::reduce(conc, conc); 
     flux = -km*(r)*conc[r] + 2*km*frag_term - 2*ka*conc[r]*conc[0] + 2*ka*conc[r-1]*conc[0]; 
     } 
    else if (r == (nc-1)) 
     { 
     for (int s = r+1; s < maxlength; s++) 
      { 
      frag_term += conc[s]; 
      } 
     //double frag_term = thrust::reduce(conc, conc); 
     flux = kn*pow(conc[0],nc) + 2*km*frag_term - 2*ka*conc[r]*conc[0]; 
     } 
    else if (r < (nc-1)) 
     { 
     flux[r] = 0; 
     } 
} 
*/ 

/* 
Encountered Errors : 
1. nvlink - undefined reference : there's a mismatch between the function prototypes and the function definitions, possibly in the number of arguments. 
"nvlink : error : Undefined reference to '_Z2k1PdS_' in 'x64/Debug/RKF5 Prototype 2.cu.obj'" - fixed by making sure func def had same parameters as proto 

2. 1>nvlink : error : Undefined reference to '_Z2k1PdS_' in 'x64/Debug/RKF5 Prototype 2.cu.obj' 
1>nvlink : error : Undefined reference to '_Z2k2PdS_' in 'x64/Debug/RKF5 Prototype 2.cu.obj' 
1>nvlink : error : Undefined reference to '_Z2k3PdPiS_S_S_S_S_' in 'x64/Debug/RKF5 Prototype 2.cu.obj' 
1>nvlink : error : Undefined reference to '_Z2k4PdPiS_S_S_S_S_S_' in 'x64/Debug/RKF5 Prototype 2.cu.obj' 
1>nvlink : error : Undefined reference to '_Z2k5PdPiS_S_S_S_S_S_S_' in 'x64/Debug/RKF5 Prototype 2.cu.obj' 

This is caused by there being references defined in the prototype that don't exist in the actual function definition. 
*/ 

You need to provide a complete example. As far as I know, there is nothing special about 64. –


OK, I've put the code in. The output is an array of 0's, which suggests the kernel can't operate on the array. If I change numpoints back to 64, it integrates. –


The problem is with cudaMallocPitch(); it should be 'cudaMallocPitch((void**)&d_concStorage, d_pitch, numpoints*sizeof(double), maxlength)'. –

Answer


First of all, you haven't implemented proper CUDA error checking. Re-read that post and study what it tells you to do. It means you need an error check after every CUDA API call, so after each and every cudaMalloc, cudaMemcpy, etc., and after every kernel launch, place this line:

gpuErrchk(cudaPeekAtLastError()); 

Alternatively, you can simply wrap every call in the error checking macro, like this:

gpuErrchk(cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost)); 

If you don't do this, an error frequently shows up on some other line of code when it actually applies to a previous call, so yes, the error messages will be confusing.
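To make that pattern concrete, here is a minimal sketch (it reuses the gpuErrchk macro from the code above; dummyKernel and checkedLaunchExample are made-up names purely for illustration):

//Sketch: wrap every runtime API call in gpuErrchk, and follow each kernel
//launch with cudaPeekAtLastError() plus cudaDeviceSynchronize().
__global__ void dummyKernel(double* a) { a[threadIdx.x] = 0.0; }

void checkedLaunchExample()
{
    double* d_a;
    gpuErrchk(cudaMalloc((void**)&d_a, 32 * sizeof(double)));  //check the allocation
    dummyKernel<<<1, 32>>>(d_a);
    gpuErrchk(cudaPeekAtLastError());      //catches invalid launch configurations
    gpuErrchk(cudaDeviceSynchronize());    //catches errors raised while the kernel runs
    gpuErrchk(cudaFree(d_a));
}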

Once you have solid error checking in place, you will find that the invalid argument error originates from this line of code:

cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice); 

To understand why, we have to compare it against the corresponding malloc operation you made, the cudaMallocPitch call that created d_concStorage:

gpuErrchk(cudaMallocPitch((void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints)); 

Note that in this cudaMallocPitch call, numpoints is passed as the last parameter, the height parameter, whereas in the cudaMemcpy2D call above it appears as part of the width parameter. Refer to the cuda API documentation.
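For reference, the relevant runtime API signatures are as follows (the width arguments are byte counts, the height arguments are row counts):

//cudaError_t cudaMallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height);
//cudaError_t cudaMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
//                         size_t width, size_t height, cudaMemcpyKind kind);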

These calls need to have their height and width parameters harmonized with each other. I think the correct thing to do is to modify the cudaMallocPitch line to match the cudaMemcpy2D operations, like so:

gpuErrchk(cudaMallocPitch((void**)&d_concStorage, &d_pitch, numpoints * sizeof(double), maxlength)); 

When I make that change, the code compiles and runs with no API errors, and you can then set numpoints to 65.
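Putting the two pieces together, a minimal, self-contained sketch of a consistently sized pitched round trip might look like the following (fillRow, the 65x1 array shape, and the printf are illustrative only, and error checks are omitted for brevity):

#include <cuda_runtime.h> 
#include <cstdio> 

//One block per row; each thread writes one column of its row.
__global__ void fillRow(double* d_data, size_t pitch, int width)
{
    double* row = (double*)((char*)d_data + blockIdx.x * pitch);  //byte-wise row step
    if (threadIdx.x < width)
        row[threadIdx.x] = (double)threadIdx.x;
}

int main()
{
    const int width = 65, height = 1;          //65 columns is not a problem here
    double h_data[height][width] = {};         //host copy, zero-initialized
    double* d_data;
    size_t d_pitch;
    size_t h_pitch = width * sizeof(double);   //host rows are tightly packed

    //Width (in bytes) and height agree between the allocation and both copies.
    cudaMallocPitch((void**)&d_data, &d_pitch, width * sizeof(double), height);
    cudaMemcpy2D(d_data, d_pitch, h_data, h_pitch,
                 width * sizeof(double), height, cudaMemcpyHostToDevice);
    fillRow<<<height, 128>>>(d_data, d_pitch, width);
    cudaDeviceSynchronize();
    cudaMemcpy2D(h_data, h_pitch, d_data, d_pitch,
                 width * sizeof(double), height, cudaMemcpyDeviceToHost);

    printf("%f %f\n", h_data[0][0], h_data[0][width - 1]);  //expect 0 and 64
    cudaFree(d_data);
    return 0;
}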


Ahh... it's the little things that get you. Thanks for the help. I see what you're saying about the error checking; I got bitten because I didn't use it when I first wrote the program. Honestly, I didn't realize I needed it because I thought Visual Studio took care of that. –
