매트릭스의 단편 처리 - CUDA

좋아, 내가 처리하고자하는 (N x N) 매트릭스가 있다고 가정 해 보겠습니다. 이 행렬은 내 컴퓨터에 상당히 큽니다. 장치에 전송하려고하면 즉시 메모리 부족 오류가 발생합니다.매트릭스의 단편 처리 - CUDA

매트릭스 섹션을 장치로 보내는 방법이 있습니까? 필자가 볼 수있는 한 가지 방법은 호스트의 매트릭스 부분을 복사 한 다음 호스트에서 장치로 이러한 관리 가능한 복사 된 부분을 보낸 다음 끝에 다시 붙여 넣는 것입니다.

여기 내가 시도했지만, for 루프의 cudaMemcpy가 오류 코드 11, '잘못된 인수'를 반환합니다.

int h_N = 10000; 
size_t h_size_m = h_N*sizeof(float); 
h_A = (float*)malloc(h_size_m*h_size_m); 

int d_N = 2500; 
size_t d_size_m = d_N*sizeof(float); 

InitializeMatrices(h_N); 

int i; 
int iterations = (h_N*h_N)/(d_N*d_N); 

for(i = 0; i < iterations; i++) 
{ 
    float* h_array_ref = h_A+(i*d_N*d_N); 
    cudasafe(cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy"); 
    cudasafe(cudaFree(d_A), "cudaFree(d_A)"); 
}

내가 위의 코드를 달성하기 위해 노력하고있어 이것이다 : 대신 장치에 전체 매트릭스를 보내, 나는 단순히 장치에 그 행렬 및 예약 충분한 공간 내에서 장소에 대한 포인터를 보내 작업을 수행 한 다음 루프의 다음 반복을 통해 행렬 등에서 포인터를 앞쪽으로 이동하십시오.

출처

2012-11-23 t. fochtman

설명하는 방식으로 매트릭스의 타일 처리를하는 것은 가능합니다. 귀하의 현재 문제에 관해서는, 귀하의 코드에서 매트릭스 타일에 대한 cudaMalloc()을 볼 수 없습니다. 행렬의 2D 레이아웃으로 인해 각 타일의 연속 (또는 보관 규칙에 따라 행)이 인접하지 않으므로 각 타일을 cudaMemcpy2D()로 복사해야합니다. – njuffa

뿐만 아니라 문제를 쉽게이 방법으로 하위 배열로 분해했다고 가정합니다), 성능면에서 매우 유용 할 수 있습니다. 작업을 설명한 기본 접근법을 얻은 후에는 asynchronous memory copies을 사용하고 더블 버퍼링을 사용하여 메모리 전송 시간 중 일부를 이미 카드에 저장된 값을 계산하는 데 소요 된 시간과 겹치게 할 수 있습니다.

하지만 처음에는 간단하게 작동합니다. 다음은 1 차원 예제입니다 (벡터에 스칼라를 곱하고 다른 스칼라를 더함). 그러나 선형화 된 2 차원 배열을 사용하면 동일합니다. 키 부분은

CHK_CUDA(cudaMalloc(&xd, batchsize*sizeof(float))); 
CHK_CUDA(cudaMalloc(&yd, batchsize*sizeof(float))); 
tick(&gputimer); 

int nbatches = 0; 
for (int nstart=0; nstart < n; nstart+=batchsize) { 

    int size=batchsize; 
    if ((nstart + batchsize) > n) size = n - nstart; 

    CHK_CUDA(cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice)); 

    blocksize = (size+nblocks-1)/nblocks; 
    cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size); 

    CHK_CUDA(cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost)); 

    nbatches++; 
} 
gputime = tock(&gputimer); 

CHK_CUDA(cudaFree(xd)); 
CHK_CUDA(cudaFree(yd));

당신은 당신이 완료 될 때까지, 때마다, 복사를하고 커널을 시작하고 다시 복사를 통해 다음 루프를 시작할 때 버퍼를 할당하고있다. 당신은 결국 무료입니다.

전체 코드는이 하나 가져 실행

#include <stdio.h> 
#include <stdlib.h> 
#include <getopt.h> 
#include <cuda.h> 
#include <sys/time.h> 
#include <math.h> 

#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}} 

__global__ void cuda_saxpb(const float *xd, const float a, const float b, 
          float *yd, const int n) { 

    int i = threadIdx.x + blockIdx.x*blockDim.x; 
    if (i<n) { 
     yd[i] = a*xd[i]+b; 
    } 
    return; 
} 

void cpu_saxpb(const float *x, float a, float b, float *y, int n) { 

    int i; 
    for (i=0;i<n;i++) { 
     y[i] = a*x[i]+b; 
    } 
    return; 
} 

int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b); 
void tick(struct timeval *timer); 
double tock(struct timeval *timer); 

int main(int argc, char **argv) { 
    int n=1000; 
    int nblocks=10; 
    int batchsize=100; 
    float a = 5.; 
    float b = -1.; 
    int err; 
    float *x, *y, *ycuda; 
    float *xd, *yd; 
    double abserr; 
    int blocksize; 
    int i; 
    struct timeval cputimer; 
    struct timeval gputimer; 
    double cputime, gputime; 

    err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b); 
    if (batchsize > n) { 
     fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n); 
     batchsize = n; 
    } 
    if (err) return 0; 

    x = (float *)malloc(n*sizeof(float)); 
    if (!x) return 1; 

    y = (float *)malloc(n*sizeof(float)); 
    if (!y) {free(x); return 1;} 

    ycuda = (float *)malloc(n*sizeof(float)); 
    if (!ycuda) {free(y); free(x); return 1;} 

    /* run CPU code */ 

    tick(&cputimer); 
    cpu_saxpb(x, a, b, y, n); 
    cputime = tock(&cputimer); 

    /* run GPU code */ 

    /* only have to allocate once */ 
    CHK_CUDA(cudaMalloc(&xd, batchsize*sizeof(float))); 
    CHK_CUDA(cudaMalloc(&yd, batchsize*sizeof(float))); 
    tick(&gputimer); 

    int nbatches = 0; 
    for (int nstart=0; nstart < n; nstart+=batchsize) { 

     int size=batchsize; 
     if ((nstart + batchsize) > n) size = n - nstart; 

     CHK_CUDA(cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice)); 

     blocksize = (size+nblocks-1)/nblocks; 
     cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size); 

     CHK_CUDA(cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost)); 

     nbatches++; 
    } 
    gputime = tock(&gputimer); 

    CHK_CUDA(cudaFree(xd)); 
    CHK_CUDA(cudaFree(yd)); 

    abserr = 0.; 
    for (i=0;i<n;i++) { 
     abserr += fabs(ycuda[i] - y[i]); 
    } 

    printf("Y = a*X + b, problemsize = %d\n", n); 
    printf("CPU time = %lg millisec.\n", cputime*1000.); 
    printf("GPU time = %lg millisec (done with %d batches of %d).\n", 
        gputime*1000., nbatches, batchsize); 
    printf("CUDA and CPU results differ by %lf\n", abserr); 

    free(x); 
    free(y); 
    free(ycuda); 
    return 0; 
} 


int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) { 

    const struct option long_options[] = { 
    {"nvals"  , required_argument, 0, 'n'}, 
    {"nblocks" , required_argument, 0, 'B'}, 
    {"batchsize" , required_argument, 0, 's'}, 
    {"a", required_argument, 0, 'a'}, 
    {"b", required_argument, 0, 'b'}, 
    {"help",  no_argument, 0, 'h'}, 
    {0, 0, 0, 0}}; 

    char c; 
    int option_index; 
    int tempint; 

    while (1) { 
    c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index); 
    if (c == -1) break; 

    switch(c) { 
     case 'n': tempint = atoi(optarg); 
      if (tempint < 1 || tempint > 500000) { 
      fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *n); 
      } else { 
      *n = tempint; 
      } 
      break; 

     case 's': tempint = atoi(optarg); 
      if (tempint < 1 || tempint > 50000) { 
      fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *s); 
      } else { 
      *s = tempint; 
      } 
      break; 

     case 'B': tempint = atoi(optarg); 
      if (tempint < 1 || tempint > 1000 || tempint > *n) { 
      fprintf(stderr,"%s: Cannot use number of blocks %s;\n Using %d\n", argv[0], optarg, *nb); 
      } else { 
      *nb = tempint; 
      } 
      break; 

     case 'a': *a = atof(optarg); 
      break; 

     case 'b': *b = atof(optarg); 
      break; 

     case 'h': 
      puts("Calculates y[i] = a*x[i] + b on the GPU."); 
      puts("Options: "); 
      puts(" --nvals=N  (-n N): Set the number of values in y,x."); 
      puts(" --batchsize=N (-s N): Set the number of values to transfer at a time."); 
      puts(" --nblocks=N (-B N): Set the number of blocks used."); 
      puts(" --a=X   (-a X): Set the parameter a."); 
      puts(" --b=X   (-b X): Set the parameter b."); 
      puts(" --niters=N  (-I X): Set number of iterations to calculate."); 
      puts(""); 
      return +1; 
     } 
    } 

    return 0; 
} 

void tick(struct timeval *timer) { 
    gettimeofday(timer, NULL); 
} 

double tock(struct timeval *timer) { 
    struct timeval now; 
    gettimeofday(&now, NULL); 
    return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec); 
}

입니다 :

이

$ ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20 
Y = a*X + b, problemsize = 10240 
CPU time = 0.072 millisec. 
GPU time = 0.117 millisec (done with 1 batches of 10240). 
CUDA and CPU results differ by 0.000000 

$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20 
Y = a*X + b, problemsize = 10240 
CPU time = 0.066 millisec. 
GPU time = 0.133 millisec (done with 2 batches of 5120). 
CUDA and CPU results differ by 0.000000 

$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20 
Y = a*X + b, problemsize = 10240 
CPU time = 0.067 millisec. 
GPU time = 0.167 millisec (done with 4 batches of 2560). 
CUDA and CPU results differ by 0.000000

GPU의 시간은 (우리가 더 많은 메모리 복사를하고있는)이 경우 상승하지만 대답은 동일하게 유지 .

편집 됨 :이 코드의 원래 버전에는 타이밍 목적으로 커널을 여러 번 반복 할 수있는 옵션이 있지만이 컨텍스트에서는 불필요하게 혼동되어 제거되었습니다.

출처

2012-11-23 20:25:33

+1 좋은 답변! –

이것이 내가 찾고 있었던 것입니다. 고마워요. –

코드의 명확성을 위해 약간의 수정에 유의하십시오. 단지 커널 시작에 대한 반복이이 상황에서 정말 혼란 스럽다는 것을 깨달았습니다. –

매트릭스의 단편 처리 - CUDA

답변

관련 문제