CUDA는 : 커널 호출

후 메모리에 데이터 손상이 코드를보고하십시오 :CUDA는 : 커널 호출

#include <stdlib.h> 
#include <stdio.h> 

int N, L, I; 
float * inputs; 
float * temp; 

// first kernel 
__global__ void mulKernel (float * output, float * inputs)///, float * weights) 
{ 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 

    output [idx] = inputs [idx] * 3;//weights [idx]; 
    //weights [idx] = 4; 

    //__syncthreads(); 
} 

//second kernel 
__global__ void sumKernel (float * output, float * input) 
{ 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    output [idx] = input[idx]*2; 

    __syncthreads(); 
} 

void printVector (const float *p, const int N) { 
    for (int i=0; i<N; i++) 
    printf("%f\n",p[i]); 
} 

int main(int argc, char *argv[]) 
{ 
    if(argc < 3) 
     printf("Usage: cuda <layers> <inputs>\n"); 
    else 
    { 
     L = atoi(argv[1]); 
     N = atoi(argv[2]); 
     I = atoi(argv[2]); 
     inputs = (float*)malloc(I*sizeof(float)); 
     float * weights = (float*)malloc(I*sizeof(float)); 

     // and fill with some arbitrary values 
     for (int i=0; i<I; i++) 
     { 
      inputs[i] = 1; 
     } 
     for (int i=0; i<I; i++) 
     { 
      weights[i] = 1.5; 
     } 

     // allocate device memory 
     float * devInputs = NULL; 
     float * devTemp = NULL; 
     float * devWeights = NULL; 

     cudaMalloc ((void**)&devInputs, I*sizeof(float)); 
     cudaMalloc ((void**)&devTemp, I*sizeof(float)); 
     cudaMalloc ((void**)&devWeights, I*sizeof(float)); 

     // set kernel launch configuration 
     dim3 threadsMul = dim3(512, 1); 
     int blocksCount = floor(I/threadsMul.x) + 1; 
     dim3 blocksMul = dim3(blocksCount, 1); 

     dim3 threadsSum = dim3(512, 1); 
     blocksCount = floor(I/threadsSum.x) + 1; 
     dim3 blocksSum = dim3(blocksCount, 1); 

     cudaMemcpy  (devInputs, inputs, I*sizeof(float), cudaMemcpyHostToDevice); 
     cudaMemcpy  (devWeights, weights,I*sizeof(float), cudaMemcpyHostToDevice); 

     //kernels calling in this cycle 
     for(int j=0;j<L;j++) 
     { 
      // copying data to see that's ok 
      cudaMemcpy  (inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost); 
      cudaMemcpy  (weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost); 

      // print it 
      printf("inputs:\n"); 
      printVector (inputs, N); 
      printf("weights:\n"); 
      printVector (weights, N); 
      printf("\n"); 

      // running first kernel 
      mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs);//, devWeights); 

      // copying and printing data. We can see thats array weights contains a wrong values 
      cudaMemcpy  (inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost); 
      cudaMemcpy  (weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost); 

      printf("inputs:\n"); 
      printVector (inputs, N); 
      printf("weights:\n"); 
      printVector (weights, N); 
      printf("\n"); 

      if(cudaDeviceSynchronize() == cudaSuccess) 
      printf("threads syncronized\n"); 

      cudaMemcpy  (inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost); 
      cudaMemcpy  (weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost); 

      printf("inputs:\n"); 
      printVector (inputs, N); 
      printf("weights:\n"); 
      printVector (weights, N); 
      printf("\n"); 

      sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp); 

      cudaMemcpy  (inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost); 
      cudaMemcpy  (weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost); 

      printf("inputs:\n"); 
      printVector (inputs, N); 
      printf("weights:\n"); 
      printVector (weights, N); 
      printf("\n\n"); 

      if(cudaDeviceSynchronize() == cudaSuccess) 
      printf("threads syncronized\n"); 

      cudaMemcpy  (inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost); 
      cudaMemcpy  (weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost); 

      printf("inputs:\n"); 
      printVector (inputs, N); 
      printf("weights:\n"); 
      printVector (weights, N); 
      printf("\n\n"); 
     } 

     cudaMemcpy  (inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost); 

     cudaFree   (devInputs ); 
     cudaFree   (devTemp ); 
     cudaFree   (devWeights ); 

     printVector (inputs, N); 

     free(inputs); 
     free(weights); 
    } 
    return 0; 
}

을 그리고 출력을 본다. 첫 번째 커널을 호출 한 후 devWeights 배열의 데이터가 손실되었습니다. 그러나 그것은 어디에서도 사용되지 않습니다. 나는 그것을 메모리에 복사하고, 커널을 실행하며 (영향을 미치지 않음) 호스트로 다시 복사합니다. 그리고 출력에서 나는 그것이 바뀌는 것을 봅니다. 왜? 내가 도대체 뭘 잘못하고있는 겁니까?

주 기능에서는주기를 확인할 수 있습니다. 그것에서는 두 개의 커널, sumKernel과 mulKernel을 실행합니다. 커널을 실행하기 전에, 그 이후, 그리고 동기화 된 스레드들 후에 배열을 복사하여 호스트하고 인쇄합니다. 그래서 커널을 호출 한 후에 잘못된 데이터를 보게됩니다. 코드의 주석을 참조하십시오.

오류 (cudaSuccess 만 표시)가 표시되지 않습니다.

출처

2012-04-29 Robotex

CUDA 오류는 전혀 확인하지 않습니다. 오류를 확인하지 않으면 잘못된 것이 있는지 결코 알 수 없습니다. – harrism

커널 호출 코드가 없으면 그냥 추측 할 수 있습니다. 어떻게 바뀌 었습니까? 정확한 인수로 올바른 순서로 커널을 호출 했습니까? – djmj

"출력물을 봐"라고 우리에게 요청합니다. 하지만 코드의 모양이나 문제가 발생할 때 사용하는 명령 줄 인수의 값에 대해 설명하지 않았습니다. 다른 사람이 귀하의 * 코드가 무엇을해야 하는지를 알고 그 문제를 설명하기 위해 노력하지 않거나 문제를 일으키기 위해 코드를 어떻게 실행해야하는지 알려주는 방법은 무엇입니까? – talonmies

오, 오류를 발견했습니다. 내 커널에서 (idx < N)을 사용하고 CUDA가 배열 차원을 벗어날 때 오류를 인쇄하지 않았다면 사용하지 않았습니다. 그래서, 입력 배열을 변경했을 때, 입력 후에 메모리에있는 데이터도 변경했습니다.

출처

2012-05-01 13:52:01 Robotex

CUDA는 : 커널 호출

답변

관련 문제